Index: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
===================================================================
--- head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	(revision 186221)
+++ head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	(revision 186222)
@@ -1,4471 +1,4468 @@
 /**************************************************************************
 
 Copyright (c) 2007-2008, Chelsio Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
  1. Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.
 
  2. Neither the name of the Chelsio Corporation nor the names of its
     contributors may be used to endorse or promote products derived from
     this software without specific prior written permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 
 ***************************************************************************/
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/sockstate.h>
 #include <sys/sockopt.h>
 #include <sys/socket.h>
 #include <sys/sockbuf.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/protosw.h>
 #include <sys/priv.h>
 
 #if __FreeBSD_version >= 800044
 #include <sys/vimage.h>
 #else
 #define V_tcp_do_autosndbuf tcp_do_autosndbuf
 #define V_tcp_autosndbuf_max tcp_autosndbuf_max
 #define V_tcp_do_rfc1323 tcp_do_rfc1323
 #define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
 #define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
 #define V_tcpstat tcpstat
 #endif
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 
 
 #include <cxgb_osdep.h>
 #include <sys/mbufq.h>
 
 #include <netinet/ip.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_offload.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_timer.h>
 #if __FreeBSD_version >= 800056
 #include <netinet/vinet.h>
 #endif
 #include <net/route.h>
 
 #include <t3cdev.h>
 #include <common/cxgb_firmware_exports.h>
 #include <common/cxgb_t3_cpl.h>
 #include <common/cxgb_tcb.h>
 #include <common/cxgb_ctl_defs.h>
 #include <cxgb_offload.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <machine/bus.h>
 #include <sys/mvec.h>
 #include <ulp/toecore/cxgb_toedev.h>
 #include <ulp/tom/cxgb_l2t.h>
 #include <ulp/tom/cxgb_defs.h>
 #include <ulp/tom/cxgb_tom.h>
 #include <ulp/tom/cxgb_t3_ddp.h>
 #include <ulp/tom/cxgb_toepcb.h>
 #include <ulp/tom/cxgb_tcp.h>
 #include <ulp/tom/cxgb_tcp_offload.h>
 
 /*
  * For ULP connections HW may add headers, e.g., for digests, that aren't part
  * of the messages sent by the host but that are part of the TCP payload and
  * therefore consume TCP sequence space.  Tx connection parameters that
  * operate in TCP sequence space are affected by the HW additions and need to
  * compensate for them to accurately track TCP sequence numbers. This array
  * contains the compensating extra lengths for ULP packets.  It is indexed by
  * a packet's ULP submode.
  */
 const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
 
 #ifdef notyet
 /*
  * This mbuf holds a fake header-only TCP segment that we use whenever we
  * need to exploit SW TCP functionality that expects TCP headers, such as
  * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
  * CPUs without locking.
  */
 static struct mbuf *tcphdr_mbuf __read_mostly;
 #endif
 
 /*
  * Size of WRs in bytes.  Note that we assume all devices we are handling have
  * the same WR size.
  */
 static unsigned int wrlen __read_mostly;
 
 /*
  * The number of WRs needed for an mbuf chain depends on the number of page
  * fragments in the chain and whether it has any payload in its main body.
  * This maps the length of the gather list represented by an mbuf chain into
  * the # of necessary WRs.
  */
 static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
 
 /*
  * Max receive window supported by HW in bytes.  Only a small part of it can
  * be set through option0, the rest needs to be set through RX_DATA_ACK.
  */
 #define MAX_RCV_WND ((1U << 27) - 1)
 
 /*
  * Min receive window.  We want it to be large enough to accommodate receive
  * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
  */
 #define MIN_RCV_WND (24 * 1024U)
 /*
  * TOS value for an inpcb: the value returned by inp_ip_tos_get() shifted
  * right by two bits and masked with M_TOS.
  */
 #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
 
 /* Debug switches, all disabled; VALIDATE_SOCK() expands to nothing. */
 #define VALIDATE_SEQ 0
 #define VALIDATE_SOCK(so)
 #define DEBUG_WR 0
 
 /*
  * NOTE(review): presumably action codes for connection teardown paths; their
  * users are not visible in this part of the file -- confirm.
  */
 #define TCP_TIMEWAIT	1
 #define TCP_CLOSE	2
 #define TCP_DROP	3
 
 static void t3_send_reset(struct toepcb *toep);
 static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
 static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
 static void handle_syncache_event(int event, void *arg);
 
 /*
  * Debugging wrapper around sbappendstream_locked().
  *
  * Before appending, walks the m_next chain starting at sb->sb_mb and the
  * chain starting at n and asserts that every mbuf either carries no external
  * storage or carries EXT_EXTREF storage, and that m_next is not the
  * 0xffffffff poison value.  It also asserts that the sockbuf has
  * SB_NOCOALESCE set.  After the append the sockbuf chain is walked once more
  * to re-check the m_next pointers.  All checks are KASSERTs.
  */
 static inline void
 SBAPPEND(struct sockbuf *sb, struct mbuf *n)
 {
 	struct mbuf *m;
 
 	/* Validate what is already queued in the sockbuf. */
 	m = sb->sb_mb;
 	while (m) {
 		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
 		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
 			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
 		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 			m->m_next, m->m_nextpkt, m->m_flags));
 		m = m->m_next;
 	}
 	/* Validate the chain that is about to be appended. */
 	m = n;
 	while (m) {
 		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
 		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
 			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
 		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 			m->m_next, m->m_nextpkt, m->m_flags));
 		m = m->m_next;
 	}
 	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
 	sbappendstream_locked(sb, n);
 	m = sb->sb_mb;
 
 	/* Re-check the linkage of the combined chain. */
 	while (m) {
 		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 			m->m_next, m->m_nextpkt, m->m_flags));
 		m = m->m_next;
 	}
 }
 
 /*
  * Returns non-zero when the TOE device type id equals TOE_ID_CHELSIO_T3.
  */
 static inline int
 is_t3a(const struct toedev *dev)
 {
 	const int id_matches = (dev->tod_ttid == TOE_ID_CHELSIO_T3);
 
 	return (id_matches);
 }
 
 /*
  * Print the queue-set, ULP mode, MTU index, TID, work-request credit and
  * flag fields of a toepcb through DPRINTF.
  */
 static void
 dump_toepcb(struct toepcb *toep)
 {
 	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
 	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
 	    toep->tp_mtu_idx, toep->tp_tid);
 
 	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
 	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, 
 	    toep->tp_mss_clamp, toep->tp_flags);
 }
 
 #ifndef RTALLOC2_DEFINED
 /*
  * Look up a route with rtalloc1() and, when one is found, drop its lock with
  * RT_UNLOCK() before handing it back.  Returns NULL when the lookup fails.
  */
 static struct rtentry *
 rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
 {
 	struct rtentry *entry;
 
 	entry = rtalloc1(dst, report, ignflags);
 	if (entry == NULL)
 		return (NULL);
 
 	RT_UNLOCK(entry);
 	return (entry);
 }
 #endif
 
 /*
  * Either transmit a CPL message right away or park it for later.
  *
  * While the connection is in SYN_SENT the message is appended to the
  * toepcb's out_of_order_queue (under the inpcb write lock).  In any other
  * state it is transmitted immediately: through l2t_send() when through_l2t
  * is non-zero, otherwise straight through cxgb_ofld_send().
  */
 static inline void
 send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
 {
 	struct tcpcb *tcb = toep->tp_tp;
 
 	if (__predict_false(tcb->t_state == TCPS_SYN_SENT)) {
 		/* Defer the message. */
 		inp_wlock(tcb->t_inpcb);
 		mbufq_tail(&toep->out_of_order_queue, m);
 		inp_wunlock(tcb->t_inpcb);
 		return;
 	}
 
 	if (through_l2t) {
 		/* Send through the L2T entry. */
 		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
 		return;
 	}
 
 	/* Send directly. */
 	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
 }
 
 /*
  * Map a control priority to the priority used for a connection's messages.
  * The toepcb is currently not consulted: the input priority is returned
  * unchanged.
  */
 static inline unsigned int
 mkprio(unsigned int cntrl, const struct toepcb *toep)
 {
 	(void)toep;
 
 	return (cntrl);
 }
 
 /*
  * Fill in a TID_RELEASE work request in mbuf m for the given tid.  The mbuf
  * must already be properly sized; its length fields are set to the size of
  * the request and its priority to CPL_PRIORITY_SETUP.
  */
 static inline void
 mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
 {
 	struct cpl_tid_release *rel;
 
 	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
 
 	m->m_len = sizeof(*rel);
 	m->m_pkthdr.len = m->m_len;
 
 	rel = mtod(m, struct cpl_tid_release *);
 	rel->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	rel->wr.wr_lo = 0;
 	OPCODE_TID(rel) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
 }
 
 /*
  * Build the TX_DATA work request header at the front of mbuf m.
  *
  * so   - socket the data belongs to; its tcpcb/toepcb supply the TID, the
  *        L2T entry and the send sequence number.
  * m    - mbuf that receives the header; m_len is set to the header size.
  * len  - payload length written into the request.
  * tail - next unsent mbuf, or NULL when this request carries the last
  *        queued data; the SHOVE flag is set only when tail is NULL and
  *        TF_MORETOCOME is clear.
  *
  * On the first request for a connection (TP_DATASENT not yet set) the
  * request additionally carries F_TX_INIT, the CPU index and the send buffer
  * size, and TP_DATASENT is set.  The inpcb lock must be held.
  */
 static inline void
 make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
 {
 	INIT_VNET_INET(so->so_vnet);
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct tx_data_wr *req;
 	struct sockbuf *snd;
 	
 	inp_lock_assert(tp->t_inpcb);
 	snd = so_sockbuf_snd(so);
 	
 	req = mtod(m, struct tx_data_wr *);
 	m->m_len = sizeof(*req);
 	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
 	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
 	/* len includes the length of any HW ULP additions */
 	req->len = htonl(len);
 	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
 	/* V_TX_ULP_SUBMODE sets both the mode and submode */
 	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
 	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
 	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
 				   (tail ? 0 : 1))));
 	req->sndseq = htonl(tp->snd_nxt);
 	/* First data on this connection: send the one-time init parameters. */
 	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
 		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | 
 				    V_TX_CPU_IDX(toep->tp_qset));
  
 		/* Sendbuffer is in units of 32KB.
 		 */
 		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) 
 			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
 		else {
 			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
 		}
 		
 		toep->tp_flags |= TP_DATASENT;
 	}
 }
 
 #define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
 
 /*
  * Push unsent data from the socket's send buffer to the adapter.
  *
  * Starting at the sockbuf send pointer (or the head of the sockbuf), builds
  * TX_DATA work requests for as long as WR credits (tp_wr_avail) and unsent
  * mbufs remain.  An mbuf of at most IMM_LEN bytes is copied into the request
  * as immediate data; otherwise up to TX_MAX_SEGS - 1 mbufs are described by a
  * gather list.  Each request is queued on the toepcb's WR list and sent via
  * l2t_send().
  *
  * req_completion - when non-zero, a completion (F_WR_COMPL) is requested for
  *                  a request that is the only unacknowledged one; a
  *                  completion is also requested whenever the unacknowledged
  *                  credits reach half of tp_wr_max.
  *
  * Returns the number of bytes handed to the adapter.  Returns 0 when the
  * connection is in SYN_SENT or CLOSED, the socket is disconnecting or
  * disconnected, there are no credits or no data, or a header mbuf cannot be
  * allocated.
  *
  * NOTE(review): on allocation failure inside the loop 0 is returned even if
  * earlier iterations already sent data (total_bytes is discarded) -- confirm
  * whether callers depend on the return value.
  */
 int
 t3_push_frames(struct socket *so, int req_completion)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	
 	struct mbuf *tail, *m0, *last;
 	struct t3cdev *cdev;
 	struct tom_data *d;
 	int state, bytes, count, total_bytes;
 	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
 	struct sockbuf *snd;
 	
 	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
 		DPRINTF("tcp state=%d\n", tp->t_state);	
 		return (0);
 	}	
 
 	state = so_state_get(so);
 	
 	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
 		DPRINTF("disconnecting\n");
 		
 		return (0);
 	}
 
 	inp_lock_assert(tp->t_inpcb);
 
 	snd = so_sockbuf_snd(so);
 	sockbuf_lock(snd);
 
 	d = TOM_DATA(toep->tp_toedev);
 	cdev = d->cdev;
 
 	/* Resume at the send pointer if one is set, else at the sockbuf head. */
 	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
 
 	total_bytes = 0;
 	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
 	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
 
 	/*
 	 * tp_m_last records the last mbuf of a previous call that drained the
 	 * queue; if the send pointer still refers to it, that mbuf has already
 	 * been sent, so start with its successor.
 	 */
 	if (last && toep->tp_m_last == last  && snd->sb_sndptroff != 0) {
 		KASSERT(tail, ("sbdrop error"));
 		last = tail = tail->m_next;
 	}
 
 	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
 		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
 		sockbuf_unlock(snd);
 
 		return (0);		
 	}
 			
 	toep->tp_m_last = NULL;
 	while (toep->tp_wr_avail && (tail != NULL)) {
 		count = bytes = 0;
 		segp = segs;
 		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
 			sockbuf_unlock(snd);
 			return (0);
 		}
 		/*
 		 * If the data in tail fits as in-line, then
 		 * make an immediate data wr.
 		 */
 		if (tail->m_len <= IMM_LEN) {
 			count = 1;
 			bytes = tail->m_len;
 			last = tail;
 			tail = tail->m_next;
 			m_set_sgl(m0, NULL);
 			m_set_sgllen(m0, 0);
 			make_tx_data_wr(so, m0, bytes, tail);
 			m_append(m0, bytes, mtod(last, caddr_t));
 			KASSERT(!m0->m_next, ("bad append"));
 		} else {
 			/*
 			 * Gather mbufs while the credits needed for one more
 			 * segment are available and the segment limit is not
 			 * reached.
 			 */
 			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
 			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
 				bytes += tail->m_len;
 				last = tail;
 				count++;
 				/*
 				 * technically an abuse to be using this for a VA
 				 * but less gross than defining my own structure
 				 * or calling pmap_kextract from here :-|
 				 */
 				segp->ds_addr = (bus_addr_t)tail->m_data;
 				segp->ds_len = tail->m_len;
 				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
 				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
 				segp++;
 				tail = tail->m_next;
 			}
 			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
 			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);	
 
 			m_set_sgl(m0, segs);
 			m_set_sgllen(m0, count);
 			make_tx_data_wr(so, m0, bytes, tail);
 		}
 		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
 
 		/*
 		 * Advance the send pointer; when the queue is drained remember
 		 * the last mbuf sent in tp_m_last.
 		 */
 		if (tail) {
 			snd->sb_sndptr = tail;
 			toep->tp_m_last = NULL;
 		} else 
 			toep->tp_m_last = snd->sb_sndptr = last;
 
 
 		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
 
 		snd->sb_sndptroff += bytes;
 		total_bytes += bytes;
 		toep->tp_write_seq += bytes;
 		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
 		    " tail=%p sndptr=%p sndptroff=%d",
 		    toep->tp_wr_avail, count, mbuf_wrs[count],
 		    tail, snd->sb_sndptr, snd->sb_sndptroff);	
 		if (tail)
 			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
 			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
 			    total_bytes, toep->tp_m_last, tail->m_data,
 			    tp->snd_una);
 		else
 			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
 			    " tp_m_last=%p snd_una=0x%08x",
 			    total_bytes, toep->tp_m_last, tp->snd_una);
 
 
 #ifdef KTR		
{
		/* Trace the gather list, up to three segments per trace record. */
		int i;

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				    i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM, 
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
				    i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}
	
		}
}
 #endif		
                  /*
 		 * remember credits used
 		 */
 		m0->m_pkthdr.csum_data = mbuf_wrs[count];
 		m0->m_pkthdr.len = bytes;
 		toep->tp_wr_avail -= mbuf_wrs[count];
 		toep->tp_wr_unacked += mbuf_wrs[count];
 		
 		/* Ask for a completion so that credits get returned. */
 		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
 		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
 			struct work_request_hdr *wr = cplhdr(m0);
 
 			wr->wr_hi |= htonl(F_WR_COMPL);
 			toep->tp_wr_unacked = 0;	
 		}
 		KASSERT((m0->m_pkthdr.csum_data > 0) &&
 		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
 			m0->m_pkthdr.csum_data));
 		m0->m_type = MT_DONTFREE;
 		enqueue_wr(toep, m0);
 		DPRINTF("sending offload tx with %d bytes in %d segments\n",
 		    bytes, count);
 		l2t_send(cdev, m0, toep->tp_l2t);
 	}
 	sockbuf_unlock(snd);
 	return (total_bytes);
 }
 
 /*
  * Close a connection by sending a CPL_CLOSE_CON_REQ message.
  *
  * Takes the inpcb write lock, pushes any pending send data (unless the
  * connection is still in SYN_SENT) and, if no FIN has been sent yet
  * (TP_FIN_SENT clear), builds the close request, marks TP_FIN_SENT, drops the
  * lock and hands the request to cxgb_ofld_send().  The request mbuf comes
  * from m_gethdr_nofail().  Does nothing further when TP_FIN_SENT is already
  * set.
  */
 static void
 close_conn(struct socket *so)
 {
 	struct mbuf *m;
 	struct cpl_close_con_req *req;
 	struct tom_data *d;
 	struct inpcb *inp = so_sotoinpcb(so);
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	unsigned int tid; 
 
 
 	inp_wlock(inp);
 	tp = so_sototcpcb(so);
 	toep = tp->t_toe;
 	
 	/* Flush queued data first, requesting a completion. */
 	if (tp->t_state != TCPS_SYN_SENT)
 		t3_push_frames(so, 1);
 	
 	/* A close request has already been issued for this connection. */
 	if (toep->tp_flags & TP_FIN_SENT) {
 		inp_wunlock(inp);
 		return;
 	}
 
 	tid = toep->tp_tid;
 	    
 	d = TOM_DATA(toep->tp_toedev);
 	
 	m = m_gethdr_nofail(sizeof(*req));
 	m_set_priority(m, CPL_PRIORITY_DATA);
 	m_set_sgl(m, NULL);
 	m_set_sgllen(m, 0);
 
 	toep->tp_flags |= TP_FIN_SENT;
 	req = mtod(m, struct cpl_close_con_req *);
 	
 	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
 	req->wr.wr_lo = htonl(V_WR_TID(tid));
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 	req->rsvd = 0;
 	inp_wunlock(inp);
 	/*
 	 * XXX - need to defer shutdown while there is still data in the queue
 	 *
 	 */
 	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
 	cxgb_ofld_send(d->cdev, m);
 
 }
 
 /*
  * ARP-failure handler for a CPL_ABORT_REQ: rewrite the request's command to
  * the no-RST variant (CPL_ABORT_NO_RST) and transmit it with
  * cxgb_ofld_send().
  */
 static void
 abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
 {
 	struct cpl_abort_req *abort_req;
 
 	abort_req = cplhdr(m);
 	abort_req->cmd = CPL_ABORT_NO_RST;
 
 	cxgb_ofld_send(cdev, m);
 }
 
 /*
  * Send RX credits through an RX_DATA_ACK CPL message.
  *
  * tp      - connection whose toepcb supplies the TID and TOE device.
  * credits - number of credits to return to the HW.
  * dack    - delayed-ACK bits OR-ed into the credit_dack field.
  * nofail  - not consulted in this implementation: the mbuf always comes from
  *           m_gethdr_nofail().
  *
  * Returns the number of credits sent, i.e. the credits argument.
  */
 uint32_t
 t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
 {
 	struct mbuf *m;
 	struct cpl_rx_data_ack *req;
 	struct toepcb *toep = tp->t_toe;
 	struct toedev *tdev = toep->tp_toedev;
 	
 	m = m_gethdr_nofail(sizeof(*req));
 
 	DPRINTF("returning %u credits to HW\n", credits);
 	
 	req = mtod(m, struct cpl_rx_data_ack *);
 	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	req->wr.wr_lo = 0;
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
 	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); 
 	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
 	return (credits);
 }
 
 /*
  * Ask the HW to schedule a modulation timer by sending an RX_DATA_ACK CPL
  * message with F_RX_MODULATE set.  This is only used in DDP mode, so the same
  * message also switches the DACK mode to 1 and returns the outstanding Rx
  * credits (tp_copied_seq - tp_rcv_wup); tp_rcv_wup is then advanced to
  * tp_copied_seq.
  */
 void
 t3_send_rx_modulate(struct toepcb *toep)
 {
 	struct cpl_rx_data_ack *ack_req;
 	struct mbuf *ack;
 
 	ack = m_gethdr_nofail(sizeof(*ack_req));
 	ack_req = mtod(ack, struct cpl_rx_data_ack *);
 
 	/* Work request header. */
 	ack_req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	ack_req->wr.wr_lo = 0;
 	ack->m_len = sizeof(*ack_req);
 	ack->m_pkthdr.len = ack->m_len;
 
 	/* CPL payload. */
 	OPCODE_TID(ack_req) =
 	    htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 	ack_req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
 	    V_RX_DACK_MODE(1) |
 	    V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
 
 	m_set_priority(ack, mkprio(CPL_PRIORITY_CONTROL, toep));
 	cxgb_ofld_send(TOEP_T3C_DEV(toep), ack);
 
 	/* All copied data has now been credited back. */
 	toep->tp_rcv_wup = toep->tp_copied_seq;
 }
 
 /*
  * Handle receipt of an urgent pointer.
  *
  * The whole body is guarded by URGENT_DATA_SUPPORTED; when that macro is not
  * defined this function does nothing.  The guarded code still uses
  * identifiers that are not declared in this function (sk, sk_buff,
  * skb_peek, ...).
  * NOTE(review): the guarded code looks like an unported Linux
  * implementation and presumably would not compile as-is -- confirm before
  * enabling URGENT_DATA_SUPPORTED.
  */
 static void
 handle_urg_ptr(struct socket *so, uint32_t urg_seq)
 {
 #ifdef URGENT_DATA_SUPPORTED
 	struct tcpcb *tp = so_sototcpcb(so);
 
 	urg_seq--;   /* initially points past the urgent data, per BSD */
 
 	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
 		return;                                 /* duplicate pointer */
 	sk_send_sigurg(sk);
 	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
 	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
 		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 
 		tp->copied_seq++;
 		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
 			tom_eat_skb(sk, skb, 0);
 	}
 	tp->urg_data = TCP_URG_NOTYET;
 	tp->urg_seq = urg_seq;
 #endif
 }
 
 /*
  * Returns non-zero if a socket cannot accept new Rx data, i.e. when its
  * state has SS_ISDISCONNECTED or SS_ISDISCONNECTING set.
  */
 static inline int
 so_no_receive(const struct socket *so)
 {
 	const int closing_states = SS_ISDISCONNECTED | SS_ISDISCONNECTING;
 
 	return (so_state_get(so) & closing_states);
 }
 
 /*
  * Process an urgent data notification: unless the socket can no longer
  * receive, pass the (host byte order) sequence number from the CPL header to
  * handle_urg_ptr().  The mbuf is always freed.
  */
 static void
 rx_urg_notify(struct toepcb *toep, struct mbuf *m)
 {
 	struct cpl_rx_urg_notify *notify = cplhdr(m);
 	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
 
 	VALIDATE_SOCK(so);
 
 	if (so_no_receive(so)) {
 		m_freem(m);
 		return;
 	}
 
 	handle_urg_ptr(so, ntohl(notify->seq));
 	m_freem(m);
 }
 
 /*
  * Handler for RX_URG_NOTIFY CPL messages.  ctx is the connection's toepcb;
  * cdev is not used.  Always returns 0.
  */
 static int
 do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 
 	rx_urg_notify((struct toepcb *)ctx, m);
 	return (0);
 }
 
 /*
  * Decide whether the delayed-ACK mode may be changed for a connection.
  *
  * As written this returns true whenever tp_ulp_mode is non-zero.
  * NOTE(review): if ULP_MODE_TCPDDP is non-zero the second operand can never
  * change the result, which suggests the first operand was meant to be
  * "!toep->tp_ulp_mode" (plain TCP, or DDP on a new enough device) -- confirm
  * against the intended firmware behaviour before changing.
  */
 static __inline int
 is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
 {
 	return (toep->tp_ulp_mode ||
 		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
 		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
 }
 
 /*
  * Set of states for which we should return RX credits.
  */
 #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
 
 /*
  * Called after some received data has been read.  It returns RX credits
  * to the HW for the amount of data processed.
  *
  * tp     - connection; its toepcb tracks tp_copied_seq / tp_rcv_wup.
  * copied - number of bytes just consumed, or 0 to derive the amount from
  *          the change in the receive sockbuf's byte count since the last
  *          call (tp_enqueued_bytes - sb_cc).
  *
  * Outside ESTABLISHED / FIN_WAIT_1 / FIN_WAIT_2 only tp_copied_seq is
  * advanced.  Otherwise credits are sent when they reach the
  * rx_credit_thres tunable or when the receive window is nearly exhausted.
  */
 void
 t3_cleanup_rbuf(struct tcpcb *tp, int copied)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct socket *so;
 	struct toedev *dev;
 	int dack_mode, must_send, read;
 	u32 thres, credits, dack = 0;
 	struct sockbuf *rcv;
 	
 	so = inp_inpcbtosocket(tp->t_inpcb);
 	rcv = so_sockbuf_rcv(so);
 
 	/* No credit return in this state: just account for the copied bytes. */
 	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
 		(tp->t_state == TCPS_FIN_WAIT_2))) {
 		if (copied) {
 			sockbuf_lock(rcv);
 			toep->tp_copied_seq += copied;
 			sockbuf_unlock(rcv);
 		}
 		
 		return;
 	}
 	
 	inp_lock_assert(tp->t_inpcb); 
 
 	sockbuf_lock(rcv);
 	if (copied)
 		toep->tp_copied_seq += copied;
 	else {
 		read = toep->tp_enqueued_bytes - rcv->sb_cc;
 		toep->tp_copied_seq += read;
 	}
 	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
 	toep->tp_enqueued_bytes = rcv->sb_cc;
 	sockbuf_unlock(rcv);
 
 	/* Clamp an implausibly large credit count to sb_mbmax and log it. */
 	if (credits > rcv->sb_mbmax) {
 		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
 		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
 	    credits = rcv->sb_mbmax;
 	}
 	
 	    
 	/*
 	 * XXX this won't accurately reflect credit return - we need
 	 * to look at the difference between the amount that has been 
 	 * put in the recv sockbuf and what is there now
 	 */
 
 	if (__predict_false(!credits))
 		return;
 
 	dev = toep->tp_toedev;
 	thres = TOM_TUNABLE(dev, rx_credit_thres);
 
 	/* A zero threshold disables credit return here. */
 	if (__predict_false(thres == 0))
 		return;
 
 	if (is_delack_mode_valid(dev, toep)) {
 		dack_mode = TOM_TUNABLE(dev, delack);
 		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
 			u32 r = tp->rcv_nxt - toep->tp_delack_seq;
 
 			/*
 			 * Only request the mode change once enough data has
 			 * been received since tp_delack_seq.
 			 */
 			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
 				dack = F_RX_DACK_CHANGE |
 				       V_RX_DACK_MODE(dack_mode);
 		}
 	} else 
 		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 		
 	/*
 	 * For coalescing to work effectively ensure the receive window has
 	 * at least 16KB left.
 	 */
 	must_send = credits + 16384 >= tp->rcv_wnd;
 
 	if (must_send || credits >= thres)
 		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
 }
 
 /*
  * tu_disconnect hook: close the offloaded connection belonging to tp via
  * close_conn().  Always returns 0.
  */
 static int
 cxgb_toe_disconnect(struct tcpcb *tp)
 {
 
 	DPRINTF("cxgb_toe_disconnect\n");
 
 	close_conn(inp_inpcbtosocket(tp->t_inpcb));
 	return (0);
 }
 
 /*
  * tu_reset hook: send a reset for the offloaded connection and then sever
  * the links between the tcpcb and its toepcb.  Always returns 0.
  */
 static int
 cxgb_toe_reset(struct tcpcb *tp)
 {
 	struct toepcb *pcb;
 
 	pcb = tp->t_toe;
 	t3_send_reset(pcb);
 
 	/* Unhook from the socket. */
 	tp->t_toe = NULL;
 	pcb->tp_tp = NULL;
 	tp->t_flags &= ~TF_TOE;
 
 	return (0);
 }
 
 /*
  * tu_send hook: dump the toepcb state (debug builds) and push pending send
  * data with a completion request.  The byte count reported by
  * t3_push_frames() is ignored; always returns 0.
  */
 static int
 cxgb_toe_send(struct tcpcb *tp)
 {
 
 	DPRINTF("cxgb_toe_send\n");
 	dump_toepcb(tp->t_toe);
 
 	(void)t3_push_frames(inp_inpcbtosocket(tp->t_inpcb), 1);
 	return (0);
 }
 
 /*
  * tu_rcvd hook: the inpcb lock must be held.  Calls t3_cleanup_rbuf() with
  * a copied count of 0, which makes it derive the amount read from the
  * receive sockbuf.  Always returns 0.
  */
 static int
 cxgb_toe_rcvd(struct tcpcb *tp)
 {
 
 	inp_lock_assert(tp->t_inpcb);
 
 	t3_cleanup_rbuf(tp, 0);
 	
 	return (0);
 }
 
 /*
  * tu_detach hook: break the association between a tcpcb and its toepcb.
  * The inpcb lock must be held.
  *
  * XXX how do we handle teardown in the SYN_SENT state?
  */
 static void
 cxgb_toe_detach(struct tcpcb *tp)
 {
 	struct toepcb *pcb;
 
 	inp_lock_assert(tp->t_inpcb);
 
 	pcb = tp->t_toe;
 	pcb->tp_tp = NULL;
 
 	/* Unhook from the socket. */
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 }
 	
 
 /*
  * TOE user-request hooks installed on offloaded connections (see
  * install_offload_ops()).  Each member is initialized exactly once; the
  * duplicated .tu_detach initializer has been dropped.
  */
 static struct toe_usrreqs cxgb_toe_usrreqs = {
 	.tu_disconnect = cxgb_toe_disconnect,
 	.tu_reset = cxgb_toe_reset,
 	.tu_send = cxgb_toe_send,
 	.tu_rcvd = cxgb_toe_rcvd,
 	.tu_detach = cxgb_toe_detach,
 	.tu_syncache_event = handle_syncache_event,
 };
 
 
 /*
  * Fill mbuf m with a CPL_SET_TCB_FIELD request for the connection's TID and
  * pass it to send_or_defer() (not through the L2T).
  *
  * word     - TCB word to modify.
  * mask/val - bits to change and their new values (sent big-endian).
  * no_reply - written into the request's reply field via V_NO_REPLY().
  */
 static void
 __set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
 			    uint64_t mask, uint64_t val, int no_reply)
 {
 	struct cpl_set_tcb_field *cmd;
 
 	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
 	    toep->tp_tid, word, mask, val);
 
 	cmd = mtod(m, struct cpl_set_tcb_field *);
 	m->m_len = sizeof(*cmd);
 	m->m_pkthdr.len = m->m_len;
 
 	/* Work request header. */
 	cmd->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	cmd->wr.wr_lo = 0;
 
 	/* CPL payload. */
 	OPCODE_TID(cmd) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
 	cmd->reply = V_NO_REPLY(no_reply);
 	cmd->cpu_idx = 0;
 	cmd->word = htons(word);
 	cmd->mask = htobe64(mask);
 	cmd->val = htobe64(val);
 
 	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 	send_or_defer(toep, m, 0);
 }
 
 /*
  * Send a SET_TCB_FIELD CPL message (with no reply requested) that updates
  * the bits selected by mask in TCB word `word' to val.
  *
  * Nothing is sent when toep is NULL, when the connection is CLOSED, or when
  * TP_ABORT_SHUTDOWN is set on the toepcb.
  */
 static void
 t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
 {
 	struct mbuf *m;
 	struct tcpcb *tp;
 
 	if (toep == NULL)
 		return;
 
 	/* Only dereference toep once it is known to be non-NULL. */
 	tp = toep->tp_tp;
 
 	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
 		printf("not seting field\n");
 		return;
 	}
 
 	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
 
 	__set_tcb_field(toep, m, word, mask, val, 1);
 }
 
 /*
  * Set one of the t_flags bits in the TCB.
  *
  * bit_pos - position of the flag within W_TCB_T_FLAGS1.
  * val     - new value of the flag (0 or 1).
  *
  * The value is widened to 64 bits before shifting so that it matches the
  * 64-bit mask and the shift stays well defined for bit positions of 31 and
  * above.
  */
 static void
 set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
 {
 
 	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos,
 	    (uint64_t)val << bit_pos);
 }
 
 /*
  * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
  */
 static void
 t3_set_nagle(struct toepcb *toep)
 {
 	struct tcpcb *tp = toep->tp_tp;
 	
 	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
 }
 
 /*
  * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
  * on_off is written to the S_TF_KEEPALIVE bit of the TCB t_flags.
  */
 void
 t3_set_keepalive(struct toepcb *toep, int on_off)
 {
 
 	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
 }
 
 /*
  * Send a SET_TCB_FIELD CPL message that writes on_off to the
  * S_TF_RCV_COALESCE_ENABLE bit of the TCB t_flags.
  */
 void
 t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
 {
 	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
 }
 
 /*
  * Send a SET_TCB_FIELD CPL message that writes on_off to the S_TF_DACK_MSS
  * bit of the TCB t_flags.
  */
 void
 t3_set_dack_mss(struct toepcb *toep, int on_off)
 {
 
 	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
 }
 
 /*
  * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting to
  * the TOS currently stored in the connection's inpcb.
  */
 static void
 t3_set_tos(struct toepcb *toep)
 {
 	int tos;
 
 	tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
 
 	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
 	    V_TCB_TOS(tos));
 }
 
 
 /*
  * In DDP mode, TP fails to schedule a timer to push RX data to the host when
  * DDP is disabled (data is delivered to freelist). [Note that, the peer should
  * set the PSH bit in the last segment, which would trigger delivery.]
  * We work around the issue by setting a DDP buffer in a partial placed state,
  * which guarantees that TP will schedule a timer.
  */
 #define TP_DDP_TIMER_WORKAROUND_MASK\
     (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
      ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
        V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
 #define TP_DDP_TIMER_WORKAROUND_VAL\
     (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
      ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
       32))
 
 /*
  * Turn DDP on or off for a connection by updating W_TCB_RX_DDP_FLAGS.
  *
  * Enabling clears the DDP_OFF flag.  Disabling sets DDP_OFF and, in the same
  * update, applies the DDP timer workaround mask/value defined by the
  * TP_DDP_TIMER_WORKAROUND_* macros.
  */
 static void
 t3_enable_ddp(struct toepcb *toep, int on)
 {
 	uint64_t mask, val;
 
 	if (on) {
 		mask = V_TF_DDP_OFF(1);
 		val = V_TF_DDP_OFF(0);
 	} else {
 		mask = V_TF_DDP_OFF(1) |
 		    TP_DDP_TIMER_WORKAROUND_MASK;
 		val = V_TF_DDP_OFF(1) |
 		    TP_DDP_TIMER_WORKAROUND_VAL;
 	}
 
 	t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, mask, val);
 }
 
 /*
  * Set the DDP tag of a DDP buffer.  The TCB word is selected as
  * W_TCB_RX_DDP_BUF0_TAG + buf_idx and the whole tag field (buffer-0 mask) is
  * replaced by tag_color.
  */
 void
 t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
 {
 	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
 			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
 			 tag_color);
 }
 
 /*
  * Program the offset and length of a DDP buffer in the TCB.  buf_idx 0
  * selects buffer 0; any other value selects buffer 1.
  *
  * NOTE(review): for buffer 1 the length mask and value are shifted left by
  * 32 before being passed to V_TCB_RX_DDP_BUF1_LEN(), while buffer 0 applies
  * no extra shift -- presumably because the buffer-1 length sits in the upper
  * half of the 64-bit field; confirm against the TCB layout.
  */
 void
 t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
 		    unsigned int len)
 {
 	if (buf_idx == 0)
 		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
 			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
 			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
 			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
 	else
 		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
 			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
 			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
 			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
 			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
 }
 
 /*
  * Select a congestion control algorithm by name.
  *
  * Without CONGESTION_CONTROL_SUPPORTED this is a no-op that returns 0.  With
  * it, the name is looked up in t3_cong_ops and -EINVAL is returned when it is
  * not found; the found index is not otherwise used here.
  * NOTE(review): the negative error value follows the Linux convention --
  * confirm before enabling this code.
  */
 static int
 t3_set_cong_control(struct socket *so, const char *name)
 {
 #ifdef CONGESTION_CONTROL_SUPPORTED	
 	int cong_algo;
 
 	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
 		if (!strcmp(name, t3_cong_ops[cong_algo].name))
 			break;
 
 	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
 		return -EINVAL;
 #endif
 	return 0;
 }
 
 int
 t3_get_tcb(struct toepcb *toep)
 {
 	struct cpl_get_tcb *req;
 	struct tcpcb *tp = toep->tp_tp;
 	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
 
 	if (!m)
 		return (ENOMEM);
 	
 	inp_lock_assert(tp->t_inpcb);	
 	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 	req = mtod(m, struct cpl_get_tcb *);
 	m->m_pkthdr.len = m->m_len = sizeof(*req);
 	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	req->wr.wr_lo = 0;
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
 	req->cpuno = htons(toep->tp_qset);
 	req->rsvd = 0;
 	if (tp->t_state == TCPS_SYN_SENT)
 		mbufq_tail(&toep->out_of_order_queue, m);	// defer
 	else
 		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
 	return 0;
 }
 
 /*
  * Register a toepcb under a TID: takes a reference on the toepcb with
  * toepcb_hold() and then inserts it with cxgb_insert_tid() for the TOM
  * client on the TOM's device.
  */
 static inline void
 so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
 {
 
 	toepcb_hold(toep);
 
 	cxgb_insert_tid(d->cdev, d->client, toep, tid);
 }
 
 /**
  *	find_best_mtu - find the entry in the MTU table closest to an MTU
  *	@d: TOM state
  *	@mtu: the target MTU
  *
  *	Scans the MTU table from the start and stops in front of the first
  *	entry that exceeds the target.  Returns the index of the entry that is
  *	closest to but does not exceed the target MTU; index 0 is returned
  *	when even the second entry is larger than the target.
  */
 static unsigned int
 find_best_mtu(const struct t3c_data *d, unsigned short mtu)
 {
 	int idx;
 
 	for (idx = 0; idx < d->nmtus - 1; idx++) {
 		if (d->mtus[idx + 1] > mtu)
 			break;
 	}
 	return (idx);
 }
 
 static unsigned int
 select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
 {
 	unsigned int idx;
 	
 #ifdef notyet
 	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
 #endif
 	if (tp) {
 		tp->t_maxseg = pmtu - 40;
 		if (tp->t_maxseg < td->mtus[0] - 40)
 			tp->t_maxseg = td->mtus[0] - 40;
 		idx = find_best_mtu(td, tp->t_maxseg + 40);
 
 		tp->t_maxseg = td->mtus[idx] - 40;
 	} else
 		idx = find_best_mtu(td, pmtu);
 	
 	return (idx);
 }
 
 static inline void
 free_atid(struct t3cdev *cdev, unsigned int tid)
 {
 	struct toepcb *toep = cxgb_free_atid(cdev, tid);
 
 	if (toep)
 		toepcb_release(toep);
 }
 
 /*
  * Release resources held by an offload connection (TID, L2T entry, etc.)
  *
  * Returns early when the toepcb has no TOE device or no t3cdev.  Otherwise
  * releases DDP resources, purges outstanding work requests, releases the
  * L2T entry, detaches the toepcb from its tcpcb (cancelling a pending user
  * DDP buffer and waking readers) and finally frees the TID: the atid while
  * the toepcb is still in SYN_SENT, the regular TID (plus a toepcb reference)
  * otherwise.
  */
 static void
 t3_release_offload_resources(struct toepcb *toep)
 {
 	struct tcpcb *tp = toep->tp_tp;
 	struct toedev *tdev = toep->tp_toedev;
 	struct t3cdev *cdev;
 	struct socket *so;
 	unsigned int tid = toep->tp_tid;
 	struct sockbuf *rcv;
 	
 	CTR0(KTR_TOM, "t3_release_offload_resources");
 
 	if (!tdev)
 		return;
 
 	cdev = TOEP_T3C_DEV(toep);
 	if (!cdev)
 		return;
 
 	toep->tp_qset = 0;
 	t3_release_ddp_resources(toep);
 
 #ifdef CTRL_SKB_CACHE
 	kfree_skb(CTRL_SKB_CACHE(tp));
 	CTRL_SKB_CACHE(tp) = NULL;
 #endif
 
 	/* Some work requests are still outstanding: drop them. */
 	if (toep->tp_wr_avail != toep->tp_wr_max) {
 		purge_wr_queue(toep);
 		reset_wr_list(toep);
 	}
 
 	if (toep->tp_l2t) {
 		l2t_release(L2DATA(cdev), toep->tp_l2t);
 		toep->tp_l2t = NULL;
 	}
 	toep->tp_tp = NULL;
 	if (tp) {
 		inp_lock_assert(tp->t_inpcb);
 		so = inp_inpcbtosocket(tp->t_inpcb);
 		rcv = so_sockbuf_rcv(so);		
 		/*
 		 * cancel any offloaded reads
 		 *
 		 */
 		sockbuf_lock(rcv);
 		tp->t_toe = NULL;
 		tp->t_flags &= ~TF_TOE;
 		if (toep->tp_ddp_state.user_ddp_pending) {
 			t3_cancel_ubuf(toep, rcv);
 			toep->tp_ddp_state.user_ddp_pending = 0;
 		}
 		/*
 		 * NOTE(review): no matching sockbuf_unlock() here; presumably
 		 * so_sorwakeup_locked() drops the receive sockbuf lock --
 		 * confirm.
 		 */
 		so_sorwakeup_locked(so);
 			
 	}
 	
 	if (toep->tp_state == TCPS_SYN_SENT) {
 		free_atid(cdev, tid);
 #ifdef notyet		
 		__skb_queue_purge(&tp->out_of_order_queue);
 #endif		
 	} else {                                          // we have TID
 		cxgb_remove_tid(cdev, toep, tid);
 		toepcb_release(toep);
 	}
 #if 0
 	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
 #endif
 }
 
 /*
  * Switch a socket over to the offload code paths: install the T3 socket
  * ops, flag the tcpcb as offloaded and point it at the cxgb TOE user
  * request table.  The tcpcb must already have a toepcb attached.
  */
 static void
 install_offload_ops(struct socket *so)
 {
 	struct tcpcb *tcb;
 
 	tcb = so_sototcpcb(so);
 	KASSERT(tcb->t_toe != NULL, ("toepcb not set"));
 
 	t3_install_socket_ops(so);
 	tcb->t_flags |= TF_TOE;
 	tcb->t_tu = &cxgb_toe_usrreqs;
 }
 
 /*
  * Compute the receive window scale shift for a target receive window of
  * 'space' bytes.  The window is capped at MAX_RCV_WND; when RFC 1323
  * support is enabled the shift is the number of halvings needed to bring
  * it down to 65535 or less, limited to 14.  Otherwise the shift is 0.
  */
 static __inline int
 select_rcv_wscale(int space)
 {
 	INIT_VNET_INET(so->so_vnet);
 	int shift;
 
 	shift = 0;
 	if (space > MAX_RCV_WND)
 		space = MAX_RCV_WND;
 
 	if (V_tcp_do_rfc1323) {
 		while (space > 65535 && shift < 14) {
 			space >>= 1;
 			++shift;
 		}
 	}
 
 	return (shift);
 }
 
 /*
  * Determine the receive window size for a socket.
  */
 static unsigned long
 select_rcv_wnd(struct toedev *dev, struct socket *so)
 {
 	INIT_VNET_INET(so->so_vnet);
 	struct tom_data *d = TOM_DATA(dev);
 	unsigned int wnd;
 	unsigned int max_rcv_wnd;
 	struct sockbuf *rcv;
 
 	rcv = so_sockbuf_rcv(so);
 	
 	if (V_tcp_do_autorcvbuf)
 		wnd = V_tcp_autorcvbuf_max;
 	else
 		wnd = rcv->sb_hiwat;
 
 	
 	
 	/* XXX
 	 * For receive coalescing to work effectively we need a receive window
 	 * that can accomodate a coalesced segment.
 	 */	
 	if (wnd < MIN_RCV_WND)
 		wnd = MIN_RCV_WND; 
 	
 	/* PR 5138 */
 	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ? 
 				    (uint32_t)d->rx_page_size * 23 :
 				    MAX_RCV_WND);
 	
 	return min(wnd, max_rcv_wnd);
 }
 
 /*
  * Assign offload parameters to some socket fields.  This code is used by
  * both active and passive opens.
  *
  * Links the tcpcb and toepcb to each other, records the TOE device, TID
  * and L2T entry in the toepcb, resets the work-request credits, selects
  * the MSS index from the route interface MTU, picks the receive window
  * and decides whether the connection runs in DDP ULP mode.
  */
 static inline void
 init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
     struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
 	struct sockbuf *snd, *rcv;
 	
 #ifdef notyet	
 	SOCK_LOCK_ASSERT(so);
 #endif
 	
 	snd = so_sockbuf_snd(so);
 	rcv = so_sockbuf_rcv(so);
 	
 	log(LOG_INFO, "initializing offload socket\n");
 	/*
 	 * We either need to fix push frames to work with sbcompress
 	 * or we need to add this
 	 */
 	snd->sb_flags |= SB_NOCOALESCE;
 	rcv->sb_flags |= SB_NOCOALESCE;
 	
 	/* Cross-link the tcpcb and the toepcb. */
 	tp->t_toe = toep;
 	toep->tp_tp = tp;
 	toep->tp_toedev = dev;
 	
 	toep->tp_tid = tid;
 	toep->tp_l2t = e;
 	/* Start with the full work-request credit count and nothing unacked. */
 	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
 	toep->tp_wr_unacked = 0;
 	toep->tp_delack_mode = 0;
 	
 	/* select_mss() also updates tp->t_maxseg as a side effect. */
 	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
 	/*
 	 * XXX broken
 	 * 
 	 */
 	tp->rcv_wnd = select_rcv_wnd(dev, so);
 
 	/*
 	 * DDP mode requires all three: the ddp tunable set, SO_NO_DDP not set
 	 * on the socket, and a receive window of at least MIN_DDP_RCV_WIN.
 	 * (&& binds tighter than ?:, so the whole conjunction is the condition.)
 	 */
         toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
 		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
 	toep->tp_qset_idx = 0;
 	
 	reset_wr_list(toep);
 	DPRINTF("initialization done\n");
 }
 
 /*
  * Build the high word of the option 0 value for a socket: Nagle and
  * keep-alive settings, TCAM bypass, the receive window scale derived from
  * the current receive window, and the MSS table index.
  */
 static inline unsigned int
 calc_opt0h(struct socket *so, int mtu_idx)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	int wscale = select_rcv_wscale(tp->rcv_wnd);
 	unsigned int opt0h;
 
 	opt0h = V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
 	opt0h |= V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0);
 	opt0h |= F_TCAM_BYPASS;
 	opt0h |= V_WND_SCALE(wscale);
 	opt0h |= V_MSS_IDX(mtu_idx);
 
 	return (opt0h);
 }
 
 /*
  * Build the low word of the option 0 value for a socket: the IP TOS of
  * the inpcb, the ULP mode, and the receive buffer size, which is the
  * receive window shifted right by 10 and capped at M_RCV_BUFSIZ.
  */
 static inline unsigned int
 calc_opt0l(struct socket *so, int ulp_mode)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	unsigned int rcv_bufsiz;
 	unsigned int val;
 
 	rcv_bufsiz = min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ);
 	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
 	    V_RCV_BUFSIZ(rcv_bufsiz);
 
 	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
 	return (val);
 }
 
 /*
  * Build the option 2 value.  The congestion control flavor is marked
  * valid and taken from the cong_alg tunable unless that tunable is -1,
  * in which case flavor 0 is encoded and flagged as not valid.
  */
 static inline unsigned int
 calc_opt2(const struct socket *so, struct toedev *dev)
 {
 	unsigned int opt2;
 	int flv_valid;
 
 	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
 
 	opt2 = V_FLAVORS_VALID(flv_valid);
 	if (flv_valid)
 		opt2 |= V_CONG_CONTROL_FLAVOR(TOM_TUNABLE(dev, cong_alg));
 	else
 		opt2 |= V_CONG_CONTROL_FLAVOR(0);
 
 	return (opt2);
 }
 
 #if DEBUG_WR > 1
 static int
 count_pending_wrs(const struct toepcb *toep)
 {
 	const struct mbuf *m;
 	int n = 0;
 
 	wr_queue_walk(toep, m)
 		n += m->m_pkthdr.csum_data;
 	return (n);
 }
 #endif
 
 #if 0
 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
 #endif
 	
 /*
  * Fill mbuf 'm' with a CPL_ACT_OPEN_REQ for socket 'so' using active-open
  * TID 'atid' and L2T entry 'e'.  The 4-tuple comes from the socket's
  * inpcb and the option words from the calc_opt*() helpers.
  */
 static void
 mk_act_open_req(struct socket *so, struct mbuf *m,
     unsigned int atid, const struct l2t_entry *e)
 {
 	struct inpcb *inp = so_sotoinpcb(so);
 	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
 	struct toepcb *toep = tp->t_toe;
 	struct toedev *tdev = toep->tp_toedev;
 	struct cpl_act_open_req *req;
 	unsigned int opt0h;
 
 	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
 
 	/* The request occupies the whole (single) mbuf. */
 	req = mtod(m, struct cpl_act_open_req *);
 	m->m_len = sizeof(*req);
 	m->m_pkthdr.len = m->m_len;
 
 	/* Work-request and CPL headers. */
 	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	req->wr.wr_lo = 0;
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
 
 	/* Connection 4-tuple. */
 	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
 
 	/* Connection options. */
 	opt0h = calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
 	    V_TX_CHANNEL(e->smt_idx);
 	req->opt0h = htonl(opt0h);
 	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
 	req->params = 0;
 	req->opt2 = htonl(calc_opt2(so, tdev));
 }
 
 
 /*
  * Map the status field of an ACT_OPEN_RPL to an errno value.  Statuses
  * without a dedicated mapping yield EIO; CPL_ERR_CONN_EXIST is also
  * logged.
  */
 static int
 act_open_rpl_status_to_errno(int status)
 {
 	int error;
 
 	switch (status) {
 	case CPL_ERR_CONN_RESET:
 		error = ECONNREFUSED;
 		break;
 	case CPL_ERR_ARP_MISS:
 		error = EHOSTUNREACH;
 		break;
 	case CPL_ERR_CONN_TIMEDOUT:
 		error = ETIMEDOUT;
 		break;
 	case CPL_ERR_TCAM_FULL:
 		error = ENOMEM;
 		break;
 	case CPL_ERR_CONN_EXIST:
 		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
 		error = EADDRINUSE;
 		break;
 	default:
 		error = EIO;
 		break;
 	}
 
 	return (error);
 }
 
 static void
 fail_act_open(struct toepcb *toep, int errno)
 {
 	struct tcpcb *tp = toep->tp_tp;
 
 	t3_release_offload_resources(toep);
 	if (tp) {
 		inp_wunlock(tp->t_inpcb);		
 		tcp_offload_drop(tp, errno);
 	}
 	
 #ifdef notyet
 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 #endif
 }
 
 /*
  * Handle active open failures.
  *
  * 'm' carries the CPL_ACT_OPEN_RPL and is always freed here.  If the
  * toepcb is no longer attached to a tcpcb nothing else is done;
  * otherwise the inpcb is write-locked and the open is failed with the
  * errno derived from the reply status.
  */
 static void
 active_open_failed(struct toepcb *toep, struct mbuf *m)
 {
 	struct cpl_act_open_rpl *rpl = cplhdr(m);
 	struct inpcb *inp;
 
 	if (toep->tp_tp == NULL)
 		goto done;
 
 	inp = toep->tp_tp->t_inpcb;
 
 /*
  * Don't handle connection retry for now
  */
 #ifdef notyet
 	/* NOTE(review): unported code; refers to 'sk' and 'so', not in scope. */
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (rpl->status == CPL_ERR_CONN_EXIST &&
 	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
 		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
 		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
 			       jiffies + HZ / 2);
 	} else
 #endif
 	{
 		inp_wlock(inp);
 		/*
 		 * drops the inpcb lock
 		 */
 		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
 	}
 	
 	done:
 	m_free(m);
 }
 
 /*
  * Tell whether a failed active open has allocated a TID.  That is the
  * case for every status except TCAM full, connection exists and ARP miss.
  */
 static inline int
 act_open_has_tid(int status)
 {
 	switch (status) {
 	case CPL_ERR_TCAM_FULL:
 	case CPL_ERR_CONN_EXIST:
 	case CPL_ERR_ARP_MISS:
 		return (0);
 	default:
 		return (1);
 	}
 }
 
 /*
  * CPL handler for ACT_OPEN_RPL.  On anything but a T3A device a TID that
  * the failed open allocated is queued for release; the failure itself is
  * then processed by active_open_failed(), which consumes the mbuf.
  */
 static int
 do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	struct toepcb *toep = ctx;
 	struct cpl_act_open_rpl *rpl = cplhdr(m);
 
 	if (cdev->type != T3A) {
 		if (act_open_has_tid(rpl->status))
 			cxgb_queue_tid_release(cdev, GET_TID(rpl));
 	}
 
 	active_open_failed(toep, m);
 
 	return (0);
 }
 
 /*
  * Handle an ARP failure for an active open.   XXX purge ofo queue
  *
  * If the connection is still in SYN_SENT or SYN_RECEIVED the open is
  * failed with EHOSTUNREACH and the request mbuf is freed; in any other
  * state only the inpcb lock taken here is dropped again.
  *
  * XXX badly broken for crossed SYNs as the ATID is no longer valid.
  * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
  * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
  * free the atid.  Hmm.
  */
 #ifdef notyet
 static void
 act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
 {
 	struct toepcb *toep = m_get_toep(m);
 	struct tcpcb *tp = toep->tp_tp;
 	struct inpcb *inp = tp->t_inpcb;
 	
 	inp_wlock(inp);
 	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
 		/*
 		 * drops the inpcb lock
 		 *
 		 * fail_act_open() takes the toepcb; the previous code passed
 		 * an uninitialized socket pointer here.
 		 */
 		fail_act_open(toep, EHOSTUNREACH);
 		printf("freeing %p\n", m);
 		
 		m_free(m);
 	} else
 		inp_wunlock(inp);
 }
 #endif
 /*
  * Send an active open request.
  */
 int
 t3_connect(struct toedev *tdev, struct socket *so,
     struct rtentry *rt, struct sockaddr *nam)
 {
 	struct mbuf *m;
 	struct l2t_entry *e;
 	struct tom_data *d = TOM_DATA(tdev);
 	struct inpcb *inp = so_sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep; /* allocated by init_offload_socket */
 		
 	int atid;
 
 	toep = toepcb_alloc();
 	if (toep == NULL)
 		goto out_err;
 	
 	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
 		goto out_err;
 	
 	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
 	if (!e)
 		goto free_tid;
 
 	inp_lock_assert(inp);
 	m = m_gethdr(MT_DATA, M_WAITOK);
 	
 #if 0	
 	m->m_toe.mt_toepcb = tp->t_toe;
 	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
 #endif
 	so_lock(so);
 	
 	init_offload_socket(so, tdev, atid, e, rt, toep);
 	
 	install_offload_ops(so);
 	
 	mk_act_open_req(so, m, atid, e);
 	so_unlock(so);
 	
 	soisconnecting(so);
 	toep = tp->t_toe;
 	m_set_toep(m, tp->t_toe);
 	
 	toep->tp_state = TCPS_SYN_SENT;
 	l2t_send(d->cdev, (struct mbuf *)m, e);
 
 	if (toep->tp_ulp_mode)
 		t3_enable_ddp(toep, 0);
 	return 	(0);
 	
 free_tid:
 	printf("failing connect - free atid\n");
 	
 	free_atid(d->cdev, atid);
 out_err:
 	printf("return ENOMEM\n");
        return (ENOMEM);
 }
 
 /*
  * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
  * not send multiple ABORT_REQs for the same connection and also that we do
  * not try to send a message after the connection has closed.  Nothing is
  * sent if an abort is already in progress (TP_ABORT_SHUTDOWN) or the
  * toepcb has no TOE device.
  *
  * If the toepcb still has a tcpcb attached, the caller must hold its inpcb
  * lock and the socket's send buffer is flushed before the abort is issued.
  * While the connection is still in SYN_SENT the request is queued on the
  * toepcb's out-of-order queue instead of being sent right away.
  */
 static void
 t3_send_reset(struct toepcb *toep)
 {
 	
 	struct cpl_abort_req *req;
 	unsigned int tid = toep->tp_tid;
 	int mode = CPL_ABORT_SEND_RST;
 	struct tcpcb *tp = toep->tp_tp;
 	struct toedev *tdev = toep->tp_toedev;
 	struct socket *so = NULL;
 	struct mbuf *m;
 	struct sockbuf *snd;
 	
 	if (tp) {
 		inp_lock_assert(tp->t_inpcb);
 		so = inp_inpcbtosocket(tp->t_inpcb);
 	}
 	
 	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
 		tdev == NULL))
 		return;
 	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
 
 	/*
 	 * Purge the send queue so we don't send anything after an abort.
 	 * There is no socket (and hence no send buffer to look up) when the
 	 * toepcb has already been detached from its tcpcb.
 	 */
 	if (so) {
 		snd = so_sockbuf_snd(so);
 		sbflush(snd);
 	}
 	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
 		mode |= CPL_ABORT_POST_CLOSE_REQ;
 
 	m = m_gethdr_nofail(sizeof(*req));
 	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
 	set_arp_failure_handler(m, abort_arp_failure);
 
 	req = mtod(m, struct cpl_abort_req *);
 	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
 	req->wr.wr_lo = htonl(V_WR_TID(tid));
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
 	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
 	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
 	req->cmd = mode;
 	if (tp && (tp->t_state == TCPS_SYN_SENT))
 		mbufq_tail(&toep->out_of_order_queue, m);	// defer
 	else
 		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
 }
 
 /*
  * Socket option handler for non-TCP level options on an offloaded
  * connection.  IP_OPTIONS is rejected with ENOPROTOOPT, IP_TOS is applied
  * to the inpcb and pushed to the hardware via t3_set_tos(), and every
  * other option yields EOPNOTSUPP.
  */
 static int
 t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct inpcb *inp;
 	int error, optval;
 
 	switch (sopt->sopt_name) {
 	case IP_OPTIONS:
 		return (ENOPROTOOPT);
 	case IP_TOS:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 	if (error != 0)
 		return (error);
 
 	if (optval > IPTOS_PREC_CRITIC_ECP)
 		return (EINVAL);
 
 	/* Update the inpcb and the hardware under the inpcb write lock. */
 	inp = so_sotoinpcb(so);
 	inp_wlock(inp);
 	inp_ip_tos_set(inp, optval);
 	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
 	inp_wunlock(inp);
 
 	return (0);
 }
 
 /*
  * Socket option handler for the TCP-level options that the offload code
  * treats specially: TCP_CONGESTION and TCP_NODELAY.  Any other option, and
  * SOPT_GET requests for these two, yield EOPNOTSUPP.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int err = 0;
 	size_t copied;
 
 	if (sopt->sopt_name != TCP_CONGESTION &&
 	    sopt->sopt_name != TCP_NODELAY)
 		return (EOPNOTSUPP);
 
 	if (sopt->sopt_name == TCP_CONGESTION) {
 		char name[TCP_CA_NAME_MAX];
 		int optlen = sopt->sopt_valsize;
 		struct tcpcb *tp;
 		
 		if (sopt->sopt_dir == SOPT_GET) {
 			KASSERT(0, ("unimplemented"));
 			return (EOPNOTSUPP);
 		}
 
 		if (optlen < 1)
 			return (EINVAL);
 		
 		/* Copy in the algorithm name, bounded by the local buffer. */
 		err = copyinstr(sopt->sopt_val, name, 
 		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
 		if (err)
 			return (err);
 		if (copied < 1)
 			return (EINVAL);
 
 		/* tp is only used when CONGESTION_CONTROL_SUPPORTED is defined. */
 		tp = so_sototcpcb(so);
 		/*
 		 * XXX I need to revisit this
 		 */
 		if ((err = t3_set_cong_control(so, name)) == 0) {
 #ifdef CONGESTION_CONTROL_SUPPORTED
 			tp->t_cong_control = strdup(name, M_CXGB);
 #endif			
 		} else
 			return (err);
 	} else {
 		int optval, oldval;
 		struct inpcb *inp;
 		struct tcpcb *tp;
 
 		if (sopt->sopt_dir == SOPT_GET)
 			return (EOPNOTSUPP);
 	
 		err = sooptcopyin(sopt, &optval, sizeof optval,
 		    sizeof optval);
 
 		if (err)
 			return (err);
 
 		inp = so_sotoinpcb(so);
 		inp_wlock(inp);
 		tp = inp_inpcbtotcpcb(inp);
 
 		/* Remember the old flags so a change can be detected below. */
 		oldval = tp->t_flags;
 		if (optval)
 			tp->t_flags |= TF_NODELAY;
 		else
 			tp->t_flags &= ~TF_NODELAY;
 		inp_wunlock(inp);
 
 
 		/*
 		 * NOTE(review): tp->t_flags and tp->t_toe are read here after
 		 * the inpcb lock has been dropped - confirm this is safe.
 		 */
 		if (oldval != tp->t_flags && (tp->t_toe != NULL))
 			t3_set_nagle(tp->t_toe);
 
 	}
 
 	return (0);
 }
 
 /*
  * Socket option entry point for offloaded connections.  TCP-level options
  * go to t3_tcp_ctloutput(), everything else to t3_ip_ctloutput().  When
  * the chosen handler answers EOPNOTSUPP the request is passed on to the
  * regular tcp_ctloutput().
  */
 int
 t3_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int err;
 
 	if (sopt->sopt_level == IPPROTO_TCP)
 		err = t3_tcp_ctloutput(so, sopt);
 	else
 		err = t3_ip_ctloutput(so, sopt);
 
 	if (err == EOPNOTSUPP)
 		return (tcp_ctloutput(so, sopt));
 
 	return (err);
 }
 
 /*
  * Tells whether a RST has to be requested explicitly when new data shows
  * up on a connection whose receive side is closed.  Currently this is
  * unconditionally true; the toepcb argument is not consulted.
  */
 static inline int
 need_rst_on_excess_rx(const struct toepcb *toep)
 {
 	const int always_reset = 1;
 
 	return (always_reset);
 }
 
 /*
  * Deal with receive data that arrives while the socket no longer accepts
  * new data: request a reset unless an abort is already under way, then
  * free the mbuf chain.
  */
 static void
 handle_excess_rx(struct toepcb *toep, struct mbuf *m)
 {
 	if (need_rst_on_excess_rx(toep)) {
 		if ((toep->tp_flags & TP_ABORT_SHUTDOWN) == 0)
 			t3_send_reset(toep);
 	}
 
 	m_freem(m);
 }
 
 /*
  * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
  * by getting the DDP offset from the TCB.
  */
 static void
 tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
 {
 	struct ddp_state *q = &toep->tp_ddp_state;
 	struct ddp_buf_state *bsp;
 	struct cpl_get_tcb_rpl *hdr;
 	unsigned int ddp_offset;
 	struct socket *so;
 	struct tcpcb *tp;
 	struct sockbuf *rcv;	
 	int state;
 	
 	uint64_t t;
 	__be64 *tcb;
 
 	tp = toep->tp_tp;
 	so = inp_inpcbtosocket(tp->t_inpcb);
 
 	inp_lock_assert(tp->t_inpcb);
 	rcv = so_sockbuf_rcv(so);
 	sockbuf_lock(rcv);	
 	
 	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
 	 * We really need a cookie in order to dispatch the RPLs.
 	 */
 	q->get_tcb_count--;
 
 	/* It is a possible that a previous CPL already invalidated UBUF DDP
 	 * and moved the cur_buf idx and hence no further processing of this
 	 * skb is required. However, the app might be sleeping on
 	 * !q->get_tcb_count and we need to wake it up.
 	 */
 	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
 		int state = so_state_get(so);
 
 		m_freem(m);
 		if (__predict_true((state & SS_NOFDREF) == 0))
 			so_sorwakeup_locked(so);
 		else
 			sockbuf_unlock(rcv);
 
 		return;
 	}
 
 	bsp = &q->buf_state[q->cur_buf];
 	hdr = cplhdr(m);
 	tcb = (__be64 *)(hdr + 1);
 	if (q->cur_buf == 0) {
 		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
 		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
 	} else {
 		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
 		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
 	}
 	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
 	m->m_cur_offset = bsp->cur_offset;
 	bsp->cur_offset = ddp_offset;
 	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
 
 	CTR5(KTR_TOM,
 	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
 	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
 	KASSERT(ddp_offset >= m->m_cur_offset,
 	    ("ddp_offset=%u less than cur_offset=%u",
 		ddp_offset, m->m_cur_offset));
 	
 #if 0
 {
 	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
 
 	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
 	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
 
         t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
         rcv_nxt = t >> S_TCB_RCV_NXT;
         rcv_nxt &= M_TCB_RCV_NXT;
 
         t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
         rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
         rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
 
 	T3_TRACE2(TIDTB(sk),
 		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
 		  ddp_flags, rcv_nxt - rx_hdr_offset);
 	T3_TRACE4(TB(q),
 		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
 		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
 	T3_TRACE3(TB(q),
 		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
 		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
 	T3_TRACE2(TB(q),
 		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
 		 q->buf_state[0].flags, q->buf_state[1].flags);
 
 }
 #endif
 	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
 		handle_excess_rx(toep, m);
 		return;
 	}
 
 #ifdef T3_TRACE
 	if ((int)m->m_pkthdr.len < 0) {
 		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
 	}
 #endif
 	if (bsp->flags & DDP_BF_NOCOPY) {
 #ifdef T3_TRACE
 		T3_TRACE0(TB(q),
 			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
 
 		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
 			printk("!cancel_ubuf");
 			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
 		}
 #endif
 		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
 		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
 		q->cur_buf ^= 1;
 	} else if (bsp->flags & DDP_BF_NOFLIP) {
 
 		m->m_ddp_flags = 1;    /* always a kernel buffer */
 
 		/* now HW buffer carries a user buffer */
 		bsp->flags &= ~DDP_BF_NOFLIP;
 		bsp->flags |= DDP_BF_NOCOPY;
 
 		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
 		 * any new data in which case we're done. If in addition the
 		 * offset is 0, then there wasn't a completion for the kbuf
 		 * and we need to decrement the posted count.
 		 */
 		if (m->m_pkthdr.len == 0) {
 			if (ddp_offset == 0) {
 				q->kbuf_posted--;
 				bsp->flags |= DDP_BF_NODATA;
 			}
 			sockbuf_unlock(rcv);
 			m_free(m);
 			return;
 		}
 	} else {
 		sockbuf_unlock(rcv);
 
 		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
 		 * but it got here way late and nobody cares anymore.
 		 */
 		m_free(m);
 		return;
 	}
 
 	m->m_ddp_gl = (unsigned char *)bsp->gl;
 	m->m_flags |= M_DDP;
 	m->m_seq = tp->rcv_nxt;
 	tp->rcv_nxt += m->m_pkthdr.len;
 	tp->t_rcvtime = ticks;
 	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
 		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
 	if (m->m_pkthdr.len == 0) {
 		q->user_ddp_pending = 0;
 		m_free(m);
 	} else 
 		SBAPPEND(rcv, m);
 
 	state = so_state_get(so);	
 	if (__predict_true((state & SS_NOFDREF) == 0))
 		so_sorwakeup_locked(so);
 	else
 		sockbuf_unlock(rcv);
 }
 
 /*
  * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
  * in that case they are similar to DDP completions.
  *
  * Returns CPL_RET_BUF_DONE when there is no toepcb for the reply;
  * otherwise the reply is processed under the inpcb write lock and 0 is
  * returned.
  */
 static int
 do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	struct toepcb *toep = (struct toepcb *)ctx;
 
 	/* OK if socket doesn't exist */
 	if (toep == NULL) {
 		printf("null toep in do_get_tcb_rpl\n");
 		return (CPL_RET_BUF_DONE);
 	}
 
 	/*
 	 * NOTE(review): toep->tp_tp is dereferenced without a NULL check and
 	 * is re-read for the unlock - confirm it cannot be cleared meanwhile.
 	 */
 	inp_wlock(toep->tp_tp->t_inpcb);
 	tcb_rpl_as_ddp_complete(toep, m);
 	inp_wunlock(toep->tp_tp->t_inpcb);
 	
 	return (0);
 }
 
 /*
  * Account for data that was placed directly (DDP) ahead of a regular
  * RX_DATA message.  If the sequence number in the CPL header is ahead of
  * tp->rcv_nxt, the gap is recorded in the mbuf's DDP fields (m_len,
  * m_seq, m_cur_offset, m_ddp_gl, m_ddp_flags), rcv_nxt and the current
  * DDP buffer offset are advanced, and user-buffer DDP is marked not
  * ready.  Nothing is done when the sequence numbers already match.
  *
  * The caller must hold the inpcb lock; the receive sockbuf lock is taken
  * and released internally.
  */
 static void
 handle_ddp_data(struct toepcb *toep, struct mbuf *m)
 {
 	struct tcpcb *tp = toep->tp_tp;
 	struct socket *so;
 	struct ddp_state *q;
 	struct ddp_buf_state *bsp;
 	struct cpl_rx_data *hdr = cplhdr(m);
 	unsigned int rcv_nxt = ntohl(hdr->seq);
 	struct sockbuf *rcv;	
 	
 	if (tp->rcv_nxt == rcv_nxt)
 		return;
 
 	inp_lock_assert(tp->t_inpcb);
 	so  = inp_inpcbtosocket(tp->t_inpcb);
 	rcv = so_sockbuf_rcv(so);	
 	sockbuf_lock(rcv);	
 
 	q = &toep->tp_ddp_state;
 	bsp = &q->buf_state[q->cur_buf];
 	/* Arguments are listed in the same order as the labels in the message. */
 	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x%08x",
 		tp->rcv_nxt, rcv_nxt));
 	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
 	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
 	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
 	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
 
 #ifdef T3_TRACE
 	if ((int)m->m_pkthdr.len < 0) {
 		t3_ddp_error(so, "handle_ddp_data: neg len");
 	}
 #endif
 	m->m_ddp_gl = (unsigned char *)bsp->gl;
 	m->m_flags |= M_DDP;
 	m->m_cur_offset = bsp->cur_offset;
 	/* Bit 0 marks the buffer as completed; NOCOPY is carried over once. */
 	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
 	if (bsp->flags & DDP_BF_NOCOPY)
 		bsp->flags &= ~DDP_BF_NOCOPY;
 
 	m->m_seq = tp->rcv_nxt;
 	tp->rcv_nxt = rcv_nxt;
 	bsp->cur_offset += m->m_pkthdr.len;
 	if (!(bsp->flags & DDP_BF_NOFLIP))
 		q->cur_buf ^= 1;
 	/*
 	 * For now, don't re-enable DDP after a connection fell out of  DDP
 	 * mode.
 	 */
 	q->ubuf_ddp_ready = 0;
 	sockbuf_unlock(rcv);
 }
 
 /*
  * Process new data received for a connection.
  *
  * Takes the inpcb write lock to update the TCP receive state, drops it,
  * and then appends the payload to the socket's receive buffer under the
  * receive sockbuf lock.  If the socket can no longer receive, the data is
  * handed to handle_excess_rx() instead.
  */
 static void
 new_rx_data(struct toepcb *toep, struct mbuf *m)
 {
 	struct cpl_rx_data *hdr = cplhdr(m);
 	struct tcpcb *tp = toep->tp_tp;
 	struct socket *so;
 	struct sockbuf *rcv;	
 	int state;
 	int len = be16toh(hdr->len);
 
 	inp_wlock(tp->t_inpcb);
 
 	so  = inp_inpcbtosocket(tp->t_inpcb);
 	
 	if (__predict_false(so_no_receive(so))) {
 		handle_excess_rx(toep, m);
 		inp_wunlock(tp->t_inpcb);
 		TRACE_EXIT;
 		return;
 	}
 
 	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
 		handle_ddp_data(toep, m);
 	
 	m->m_seq = ntohl(hdr->seq);
 	m->m_ulp_mode = 0;                    /* for iSCSI */
 
 #if VALIDATE_SEQ
 	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
 		log(LOG_ERR,
 		       "%s: TID %u: Bad sequence number %u, expected %u\n",
 		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
 		       tp->rcv_nxt);
 		m_freem(m);
 		inp_wunlock(tp->t_inpcb);
 		return;
 	}
 #endif
 	/*
 	 * Strip the CPL header.  NOTE(review): hdr is still dereferenced
 	 * below, which presumably relies on m_adj() leaving the bytes in
 	 * place - confirm.
 	 */
 	m_adj(m, sizeof(*hdr));
 
 #ifdef URGENT_DATA_SUPPORTED
 	/*
 	 * We don't handle urgent data yet
 	 */
 	if (__predict_false(hdr->urg))
 		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
 	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
 		     tp->urg_seq - tp->rcv_nxt < skb->len))
 		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
 							 tp->rcv_nxt];
 #endif	
 	/* Track delayed-ACK mode changes reported by the hardware. */
 	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
 		toep->tp_delack_mode = hdr->dack_mode;
 		toep->tp_delack_seq = tp->rcv_nxt;
 	}
 	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
 	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
 	
 	/* Trim the mbuf to the payload length given in the CPL header. */
 	if (len < m->m_pkthdr.len)
 		m->m_pkthdr.len = m->m_len = len;
 
 	tp->rcv_nxt += m->m_pkthdr.len;
 	tp->t_rcvtime = ticks;
 	toep->tp_enqueued_bytes += m->m_pkthdr.len;
 	CTR2(KTR_TOM,
 	    "new_rx_data: seq 0x%x len %u",
 	    m->m_seq, m->m_pkthdr.len);
 	/* The inpcb lock is dropped before the sockbuf lock is taken. */
 	inp_wunlock(tp->t_inpcb);
 	rcv = so_sockbuf_rcv(so);
 	sockbuf_lock(rcv);
 #if 0	
 	if (sb_notify(rcv))
 		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
 #endif
 	SBAPPEND(rcv, m);
 
 #ifdef notyet
 	/*
 	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
 	 *
 	 */
 	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
 
 	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
 		so, rcv->sb_cc, rcv->sb_mbmax));
 #endif
 	
 
 	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
 	    rcv->sb_cc, rcv->sb_mbcnt);
 	
 	/* Wake up readers unless the socket has no file reference left. */
 	state = so_state_get(so);	
 	if (__predict_true((state & SS_NOFDREF) == 0))
 		so_sorwakeup_locked(so);
 	else
 		sockbuf_unlock(rcv);
 }
 
 /*
  * CPL handler for RX_DATA messages: forwards the mbuf to new_rx_data()
  * for the toepcb passed in as context and always returns 0.
  */
 static int
 do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	struct toepcb *toep;
 
 	toep = ctx;
 	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
 
 	new_rx_data(toep, m);
 	return (0);
 }
 
 /*
  * Process an RX_DATA_DDP message: data was placed directly into one of
  * the connection's DDP buffers.  The TCP receive state is updated under
  * the inpcb write lock; the mbuf, which only describes the placed data,
  * is then queued on the receive sockbuf under the sockbuf lock.  If the
  * socket can no longer receive, the message goes to handle_excess_rx().
  */
 static void
 new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
 {
 	struct tcpcb *tp;
 	struct ddp_state *q;
 	struct ddp_buf_state *bsp;
 	struct cpl_rx_data_ddp *hdr;
 	struct socket *so;	
 	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
 	int nomoredata = 0;
 	unsigned int delack_mode;
 	struct sockbuf *rcv;
 	
 	tp = toep->tp_tp;	
 	inp_wlock(tp->t_inpcb);
 	so = inp_inpcbtosocket(tp->t_inpcb);
 
 	if (__predict_false(so_no_receive(so))) {
 
 		handle_excess_rx(toep, m);
 		inp_wunlock(tp->t_inpcb);
 		return;
 	}
 	
 	q = &toep->tp_ddp_state;
 	hdr = cplhdr(m);
 	/* The DDP report names the hardware buffer the data went into. */
 	ddp_report = ntohl(hdr->u.ddp_report);
 	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
 	bsp = &q->buf_state[buf_idx];
 
 	CTR4(KTR_TOM,
 	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
 	    "hdr seq 0x%x len %u",
 	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
 	    ntohs(hdr->len));
 	CTR3(KTR_TOM,
 	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
 	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
 	
 	ddp_len = ntohs(hdr->len);
 	rcv_nxt = ntohl(hdr->seq) + ddp_len;
 
 	/* Track delayed-ACK mode changes reported by the hardware. */
 	delack_mode = G_DDP_DACK_MODE(ddp_report);
 	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
 		toep->tp_delack_mode = delack_mode;
 		toep->tp_delack_seq = tp->rcv_nxt;
 	}
 	
 	m->m_seq = tp->rcv_nxt;
 	tp->rcv_nxt = rcv_nxt;
 
 	tp->t_rcvtime = ticks;
 	/*
 	 * Store the length in m->m_len.  We are changing the meaning of
 	 * m->m_len here, we need to be very careful that nothing from now on
 	 * interprets ->len of this packet the usual way.
 	 */
 	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
 	/* The inpcb lock is dropped before the sockbuf lock is taken. */
 	inp_wunlock(tp->t_inpcb);
 	CTR3(KTR_TOM,
 	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
 	    m->m_len, rcv_nxt, m->m_seq);
 	/*
 	 * Figure out where the new data was placed in the buffer and store it
 	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
 	 * account for page pod's pg_offset.
 	 */
 	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
 	m->m_cur_offset = end_offset - m->m_pkthdr.len;
 
 	rcv = so_sockbuf_rcv(so);
 	sockbuf_lock(rcv);	
 
 	m->m_ddp_gl = (unsigned char *)bsp->gl;
 	m->m_flags |= M_DDP;
 	bsp->cur_offset = end_offset;
 	toep->tp_enqueued_bytes += m->m_pkthdr.len;
 
 	/*
 	 * Length is only meaningful for kbuf
 	 */
 	if (!(bsp->flags & DDP_BF_NOCOPY))
 		KASSERT(m->m_len <= bsp->gl->dgl_length,
 		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
 			m->m_len, bsp->gl->dgl_length));
 
 	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
 	KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
         /*
 	 * Bit 0 of flags stores whether the DDP buffer is completed.
 	 * Note that other parts of the code depend on this being in bit 0.
 	 */
 	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
 		panic("spurious ddp completion");
 	} else {
 		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
 		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) 
 			q->cur_buf ^= 1;                     /* flip buffers */
 	}
 
 	/* NOCOPY is passed on to the mbuf once and then cleared. */
 	if (bsp->flags & DDP_BF_NOCOPY) {
 		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
 		bsp->flags &= ~DDP_BF_NOCOPY;
 	}
 
 	if (ddp_report & F_DDP_PSH)
 		m->m_ddp_flags |= DDP_BF_PSH;
 	/* nomoredata is never set in this function, so this is never taken. */
 	if (nomoredata)
 		m->m_ddp_flags |= DDP_BF_NODATA;
 
 #ifdef notyet	
 	skb_reset_transport_header(skb);
 	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
 #endif
 	SBAPPEND(rcv, m);
 
 	/*
 	 * Wake the reader only if the socket still has a file reference and
 	 * either PSH was reported, a user (NOCOPY) buffer completed, or the
 	 * data went into a kernel buffer.
 	 */
 	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
 	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
 		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
 		so_sorwakeup_locked(so);
 	else
 		sockbuf_unlock(rcv);
 }
 
 /*
  * Union of the DDP error flags.  do_rx_data_ddp() tests the ddpvld_status
  * word of an RX_DATA_DDP message against this mask.
  */
 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
 		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
 		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
 		 F_DDP_INVALID_PPOD)
 
 /*
  * Handler for RX_DATA_DDP CPL messages.
  */
 static int
 do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	struct toepcb *toep = ctx;
 	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
 
 	VALIDATE_SOCK(so);
 
 	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
 		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
 		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
 		return (CPL_RET_BUF_DONE);
 	}
 #if 0
 	skb->h.th = tcphdr_skb->h.th;
 #endif	
 	new_rx_data_ddp(toep, m);
 	return (0);
 }
 
 /*
  * Process an RX_DDP_COMPLETE message: the hardware finished placing data
  * into a DDP buffer.  The amount of new data is the reported offset minus
  * the buffer's current offset.  TCP receive state is updated under the
  * inpcb write lock, and the mbuf describing the data is appended to the
  * receive sockbuf.  If the socket can no longer receive, the message
  * goes to handle_excess_rx().
  */
 static void
 process_ddp_complete(struct toepcb *toep, struct mbuf *m)
 {
 	struct tcpcb *tp = toep->tp_tp;
 	struct socket *so;
 	struct ddp_state *q;
 	struct ddp_buf_state *bsp;
 	struct cpl_rx_ddp_complete *hdr;
 	unsigned int ddp_report, buf_idx, when, delack_mode;
 	int nomoredata = 0;
 	struct sockbuf *rcv;
 	
 	inp_wlock(tp->t_inpcb);
 	so = inp_inpcbtosocket(tp->t_inpcb);
 
 	if (__predict_false(so_no_receive(so))) {
 		struct inpcb *inp = so_sotoinpcb(so);
 
 		handle_excess_rx(toep, m);
 		inp_wunlock(inp);
 		return;
 	}
 	q = &toep->tp_ddp_state; 
 	hdr = cplhdr(m);
 	ddp_report = ntohl(hdr->ddp_report);
 	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
 	/* csum_data is used here to carry the sequence number of the data. */
 	m->m_pkthdr.csum_data = tp->rcv_nxt;
 
 	/* Here the sockbuf lock is taken while the inpcb lock is still held. */
 	rcv = so_sockbuf_rcv(so);
 	sockbuf_lock(rcv);
 
 	bsp = &q->buf_state[buf_idx];
 	when = bsp->cur_offset;
 	/* New data is what lies between the old offset and the reported one. */
 	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
 	tp->rcv_nxt += m->m_len;
 	tp->t_rcvtime = ticks;
 
 	/* Track delayed-ACK mode changes reported by the hardware. */
 	delack_mode = G_DDP_DACK_MODE(ddp_report);
 	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
 		toep->tp_delack_mode = delack_mode;
 		toep->tp_delack_seq = tp->rcv_nxt;
 	}
 #ifdef notyet
 	skb_reset_transport_header(skb);
 	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
 #endif
 	inp_wunlock(tp->t_inpcb);
 
 	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
 	CTR5(KTR_TOM,
 		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
 		  "ddp_report 0x%x offset %u, len %u",
 		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
 		   G_DDP_OFFSET(ddp_report), m->m_len);
 
 	m->m_cur_offset = bsp->cur_offset;
 	bsp->cur_offset += m->m_len;
 
 	if (!(bsp->flags & DDP_BF_NOFLIP)) {
 		q->cur_buf ^= 1;                     /* flip buffers */
 		/* A completion short of the kbuf[0] length sets nomoredata. */
 		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
 			nomoredata=1;
 	}
 		
 	CTR4(KTR_TOM,
 		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
 		  "ddp_report %u offset %u",
 		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
 		   G_DDP_OFFSET(ddp_report));
 	
 	m->m_ddp_gl = (unsigned char *)bsp->gl;
 	m->m_flags |= M_DDP;
 	/* Bit 0 marks the buffer as completed; NOCOPY is carried over once. */
 	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
 	if (bsp->flags & DDP_BF_NOCOPY)
 		bsp->flags &= ~DDP_BF_NOCOPY;
 	if (nomoredata)
 		m->m_ddp_flags |= DDP_BF_NODATA;
 
 	SBAPPEND(rcv, m);
 	/* Wake up readers unless the socket has no file reference left. */
 	if ((so_state_get(so) & SS_NOFDREF) == 0)
 		so_sorwakeup_locked(so);
 	else
 		sockbuf_unlock(rcv);
 }
 
 /*
  * CPL dispatch entry point for RX_DDP_COMPLETE messages.  The handler
  * context is the connection's toepcb; the mbuf is handed straight to
  * process_ddp_complete() and the dispatcher is told nothing is left to do.
  */
 static int
 do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	struct toepcb *toep = (struct toepcb *)ctx;
 
 	VALIDATE_SOCK(so);
 	process_ddp_complete(toep, m);
 
 	return (0);
 }
 
 /*
  * Transition a connection to TIME_WAIT.  A few tcpcb fields are adjusted
  * under the inpcb write lock first, then the lock is dropped and
  * tcp_offload_twstart() is called.
  */
 static void
 enter_timewait(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 
 	inp_wlock(inp);
 
 	/*
 	 * Account for the peer FIN in rcv_nxt only now.  This is not done
 	 * when the peer close is processed because the FIN is not carried
 	 * in the socket's receive queue, and advancing rcv_nxt without it
 	 * there would confuse facilities such as SIOCINQ.
 	 */
 	tp->rcv_nxt += 1;
 
 	/* defeat tcp_update_metrics */
 	tp->t_srtt = 0;
 	/* defeat recycling */
 	tp->ts_recent_age = 0;
 
 	inp_wunlock(inp);
 
 	tcp_offload_twstart(tp);
 }
 
 /*
  * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
  * function deals with the data that may be reported along with the FIN.
  *
  * Return value, as consumed by do_peer_fin():
  *   < 0  no further processing of the PEER_CLOSE is needed (no path in the
  *        current body returns this),
  *     0  the PEER_CLOSE carried no data; the caller still owns the mbuf and
  *        frees it,
  *     1  the mbuf was consumed here (handed to handle_excess_rx() or
  *        appended to the receive socket buffer) and must not be freed by
  *        the caller.
  * For any value >= 0 the caller continues with normal FIN processing.
  *
  * do_peer_fin() calls this with the inpcb write lock held; the function
  * takes and releases the receive sockbuf lock itself.
  */
 static int
 handle_peer_close_data(struct socket *so, struct mbuf *m)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct ddp_state *q;
 	struct ddp_buf_state *bsp;
 	struct cpl_peer_close *req = cplhdr(m);
 	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
 	struct sockbuf *rcv;
 	
 	/* HW's rcv_nxt (minus the FIN) matches ours: nothing was placed. */
 	if (tp->rcv_nxt == rcv_nxt)			/* no data */
 		return (0);
 
 	CTR0(KTR_TOM, "handle_peer_close_data");
 	if (__predict_false(so_no_receive(so))) {
 		handle_excess_rx(toep, m);
 
 		/*
 		 * Although we discard the data we want to process the FIN so
 		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
 		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
 		 * may be what will close the connection.  We return 1 because
 		 * handle_excess_rx() already freed the packet.
 		 */
 		return (1);
 	}
 
 	inp_lock_assert(tp->t_inpcb);
 	q = &toep->tp_ddp_state;
 	rcv = so_sockbuf_rcv(so);
 	sockbuf_lock(rcv);
 
 	/*
 	 * Turn the CPL mbuf into a DDP descriptor for the bytes between our
 	 * rcv_nxt and the one reported by HW, located in the current DDP
 	 * buffer at its current offset.
 	 */
 	bsp = &q->buf_state[q->cur_buf];
 	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
 	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
 	m->m_ddp_gl = (unsigned char *)bsp->gl;
 	m->m_flags |= M_DDP;
 	m->m_cur_offset = bsp->cur_offset;
 	m->m_ddp_flags = 
 	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
 	m->m_seq = tp->rcv_nxt;
 	tp->rcv_nxt = rcv_nxt;
 	bsp->cur_offset += m->m_pkthdr.len;
 	/* Switch to the other DDP buffer unless flipping is disabled. */
 	if (!(bsp->flags & DDP_BF_NOFLIP))
 		q->cur_buf ^= 1;
 #ifdef notyet	
 	skb_reset_transport_header(skb);
 	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
 #endif	
 	tp->t_rcvtime = ticks;
 	SBAPPEND(rcv, m);
 	/*
 	 * Both branches are expected to leave the sockbuf unlocked:
 	 * presumably so_sorwakeup_locked() drops the lock itself - confirm.
 	 */
 	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
 		so_sorwakeup_locked(so);
 	else
 		sockbuf_unlock(rcv);
 
 	return (1);
 }
 
 /*
  * Handle a peer FIN.
  *
  * Runs the receive-side FIN state transition for an offloaded connection.
  * The state change is made with the inpcb write lock held; the follow-up
  * action (TIME_WAIT, drop or close) is carried out after the lock has been
  * released.  The mbuf is freed on exit unless handle_peer_close_data()
  * reported that it consumed it.
  */
 static void
 do_peer_fin(struct toepcb *toep, struct mbuf *m)
 {
 	struct socket *so;
 	struct tcpcb *tp = toep->tp_tp;
 	int keep, action;
 	
 	action = keep = 0;	
 	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
 	/*
 	 * On anything but T3A, a PEER_CLOSE that arrives while an abort
 	 * reply is outstanding is ignored; only the mbuf is released.
 	 */
 	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
 		printf("abort_pending set\n");
 		
 		goto out;
 	}
 	inp_wlock(tp->t_inpcb);
 	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
 	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
 		/* In DDP mode the PEER_CLOSE may also report placed data. */
 		keep = handle_peer_close_data(so, m);
 		if (keep < 0) {
 			inp_wunlock(tp->t_inpcb);					
 			return;
 		}
 	}
 	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		CTR1(KTR_TOM,
 		    "waking up waiters for cantrcvmore on %p ", so);	
 		socantrcvmore(so);
 
 		/*
 		 * If connection is half-synchronized
 		 * (ie NEEDSYN flag on) then delay ACK,
 		 * so it may be piggybacked when SYN is sent.
 		 * Otherwise, since we received a FIN then no
 		 * more input can be expected, send ACK now.
 		 */
 		if (tp->t_flags & TF_NEEDSYN)
 			tp->t_flags |= TF_DELACK;
 		else
 			tp->t_flags |= TF_ACKNOW;
 		/*
 		 * NOTE(review): enter_timewait() increments rcv_nxt as well,
 		 * so the FIN_WAIT_2 path appears to count the FIN twice -
 		 * confirm whether that is intended.
 		 */
 		tp->rcv_nxt++;
 	}
 	
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
 	    tp->t_starttime = ticks;
 	/* FALLTHROUGH */ 
 	case TCPS_ESTABLISHED:
 		tp->t_state = TCPS_CLOSE_WAIT;
 		break;
 	case TCPS_FIN_WAIT_1:
 		tp->t_state = TCPS_CLOSING;
 		break;
 	case TCPS_FIN_WAIT_2:
 		/*
 		 * If we've sent an abort_req we must have sent it too late,
 		 * HW will send us a reply telling us so, and this peer_close
 		 * is really the last message for this connection and needs to
 		 * be treated as an abort_rpl, i.e., transition the connection
 		 * to TCP_CLOSE (note that the host stack does this at the
 		 * time of generating the RST but we must wait for HW).
 		 * Otherwise we enter TIME_WAIT.
 		 */
 		t3_release_offload_resources(toep);
 		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
 			action = TCP_CLOSE;
 		} else {
 			action = TCP_TIMEWAIT;			
 		}
 		break;
 	default:
 		log(LOG_ERR,
 		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
 		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
 	}
 	inp_wunlock(tp->t_inpcb);					
 
 	/*
 	 * Deferred action, performed without the inpcb lock.  Nothing in
 	 * this function selects TCP_DROP; the branch mirrors
 	 * process_close_con_rpl().
 	 */
 	if (action == TCP_TIMEWAIT) {
 		enter_timewait(tp);
 	} else if (action == TCP_DROP) {
 		tcp_offload_drop(tp, 0);		
 	} else if (action == TCP_CLOSE) {
 		tcp_offload_close(tp);		
 	}
 
 #ifdef notyet		
 	/* Do not send POLL_HUP for half duplex close. */
 	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
 	    sk->sk_state == TCP_CLOSE)
 		sk_wake_async(so, 1, POLL_HUP);
 	else
 		sk_wake_async(so, 1, POLL_IN);
 #endif
 
 out:
 	/* keep != 0 means handle_peer_close_data() already consumed m. */
 	if (!keep)
 		m_free(m);
 }
 
 /*
  * CPL dispatch entry point for PEER_CLOSE messages.  The handler context is
  * the connection's toepcb; all of the work is done by do_peer_fin().
  */
 static int
 do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	struct toepcb *toep = ctx;
 
 	VALIDATE_SOCK(so);
 	do_peer_fin(toep, m);
 
 	return (0);
 }
 
 /*
  * Process a CLOSE_CON_RPL, the hardware's reply to our close request.
  *
  * Records the acknowledged send sequence number and advances the TCP state
  * machine with the inpcb write lock held.  The resulting action (TIME_WAIT,
  * drop or close) is performed after the lock is released.  The mbuf is
  * always freed before returning.
  */
 static void
 process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
 {
 	struct cpl_close_con_rpl *rpl = cplhdr(m);
 	struct tcpcb *tp = toep->tp_tp;	
 	struct socket *so;	
 	int action = 0;
 	struct sockbuf *rcv;	
 	
 	inp_wlock(tp->t_inpcb);
 	so = inp_inpcbtosocket(tp->t_inpcb);	
 	
 	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
 
 	/*
 	 * On anything but T3A, ignore the reply while an abort reply is
 	 * still outstanding.
 	 */
 	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
 		inp_wunlock(tp->t_inpcb);
 		goto out;
 	}
 	
 	/*
 	 * NOTE(review): so is dereferenced here unconditionally although the
 	 * FIN_WAIT_1 case below checks it for NULL - confirm whether so can
 	 * actually be NULL at this point.
 	 */
 	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep, 
 	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
 
 	switch (tp->t_state) {
 	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
 		t3_release_offload_resources(toep);
 		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
 			action = TCP_CLOSE;
 
 		} else {
 			action = TCP_TIMEWAIT;
 		}
 		break;
 	case TCPS_LAST_ACK:
 		/*
 		 * In this state we don't care about pending abort_rpl.
 		 * If we've sent abort_req it was post-close and was sent too
 		 * late, this close_con_rpl is the actual last message.
 		 */
 		t3_release_offload_resources(toep);
 		action = TCP_CLOSE;
 		break;
 	case TCPS_FIN_WAIT_1:
 		/*
 		 * If we can't receive any more
 		 * data, then closing user can proceed.
 		 * Starting the timer is contrary to the
 		 * specification, but if we don't get a FIN
 		 * we'll hang forever.
 		 *
 		 * XXXjl:
 		 * we should release the tp also, and use a
 		 * compressed state.
 		 */
 		if (so)
 			rcv = so_sockbuf_rcv(so);
 		else
 			break;
 		
 		if (rcv->sb_state & SBS_CANTRCVMORE) {
 			int timeout;
 
 			/* so is known to be non-NULL here (checked above). */
 			if (so)
 				soisdisconnected(so);
 			timeout = (tcp_fast_finwait2_recycle) ? 
 			    tcp_finwait2_timeout : tcp_maxidle;
 			tcp_timer_activate(tp, TT_2MSL, timeout);
 		}
 		tp->t_state = TCPS_FIN_WAIT_2;
 		/*
 		 * SO_LINGER with a zero linger time asks for the connection
 		 * to be dropped, unless an abort shutdown is already under
 		 * way.
 		 */
 		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
 		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
 			action = TCP_DROP;
 		}
 
 		break;
 	default:
 		log(LOG_ERR,
 		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
 		       toep->tp_toedev->tod_name, toep->tp_tid,
 		       tp->t_state);
 	}
 	inp_wunlock(tp->t_inpcb);
 
 
 	/* Deferred action, performed without the inpcb lock. */
 	if (action == TCP_TIMEWAIT) {
 		enter_timewait(tp);
 	} else if (action == TCP_DROP) {
 		tcp_offload_drop(tp, 0);		
 	} else if (action == TCP_CLOSE) {
 		tcp_offload_close(tp);		
 	}
 out:
 	m_freem(m);
 }
 
 /*
  * CPL dispatch entry point for CLOSE_CON_RPL messages.  The handler context
  * is the connection's toepcb; process_close_con_rpl() does the work and
  * frees the mbuf.
  */
 static int
 do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 
 	process_close_con_rpl((struct toepcb *)ctx, m);
 	return (0);
 }
 
 /*
  * Process abort replies.  We only process these messages if we anticipate
  * them as the coordination between SW and HW in this area is somewhat lacking
  * and sometimes we get ABORT_RPLs after we are done with the connection that
  * originated the ABORT_REQ.
  *
  * The flag handling is done under the inpcb write lock.  If the reply
  * completes the abort, the offload resources are released under the lock and
  * tcp_offload_close() is called after it has been dropped.  The mbuf is
  * always freed.
  */
 static void
 process_abort_rpl(struct toepcb *toep, struct mbuf *m)
 {
 	struct tcpcb *tp = toep->tp_tp;
 	struct socket *so;	
 	int needclose = 0;
 	
 #ifdef T3_TRACE
 	T3_TRACE1(TIDTB(sk),
 		  "process_abort_rpl: GTS rpl pending %d",
 		  sock_flag(sk, ABORT_RPL_PENDING));
 #endif
 	
 	inp_wlock(tp->t_inpcb);
 	/* so is looked up but not used further in this function. */
 	so = inp_inpcbtosocket(tp->t_inpcb);
 	
 	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
 		/*
 		 * XXX panic on tcpdrop
 		 */
 		/*
 		 * On non-T3A parts the first reply only sets
 		 * TP_ABORT_RPL_RCVD; the abort is completed when a further
 		 * reply arrives with that flag already set.  T3A completes
 		 * on the first reply.
 		 */
 		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
 			toep->tp_flags |= TP_ABORT_RPL_RCVD;
 		else {
 			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
 			/*
 			 * On T3A, skip the close when an ABORT_REQ has been
 			 * received.  On other parts the close always happens
 			 * and TP_ABORT_REQ_RCVD being set is treated as
 			 * fatal.
 			 */
 			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
 			    !is_t3a(toep->tp_toedev)) {
 				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
 					panic("TP_ABORT_REQ_RCVD set");
 				t3_release_offload_resources(toep);
 				needclose = 1;
 			}
 		}
 	}
 	inp_wunlock(tp->t_inpcb);
 
 	/* Close only after the inpcb lock has been released. */
 	if (needclose)
 		tcp_offload_close(tp);
 
 	m_free(m);
 }
 
 /*
  * CPL dispatch entry point for ABORT_RPL_RSS messages.
  *
  * Replies that cannot or need not be matched to a live connection are
  * discarded; everything else is passed to process_abort_rpl() with an extra
  * toepcb reference held for the duration of the call.
  */
 static int
 do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
 	struct toepcb *toep = (struct toepcb *)ctx;
 
 	/*
 	 * Ignore replies to post-close aborts indicating that the abort was
 	 * requested too late.  These connections are terminated when we get
 	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
 	 * arrives the TID is either no longer used or it has been recycled.
 	 */
 	if (rpl->status == CPL_ERR_ABORT_FAILED)
 		goto discard;
 
 	/*
 	 * Sometimes we've already closed the socket, e.g., a post-close
 	 * abort races with ABORT_REQ_RSS, the latter frees the socket
 	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
 	 * but FW turns the ABORT_REQ into a regular one and so we get
 	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
 	 */
 	if (toep == NULL)
 		goto discard;
 
 	/*
 	 * The toepcb is no longer attached to a tcpcb: give back the TID and
 	 * the L2T entry (if any) and drop the toepcb reference.
 	 */
 	if (toep->tp_tp == NULL) {
 		log(LOG_NOTICE, "removing tid for abort\n");
 		cxgb_remove_tid(cdev, toep, toep->tp_tid);
 		if (toep->tp_l2t != NULL)
 			l2t_release(L2DATA(cdev), toep->tp_l2t);
 
 		toepcb_release(toep);
 		goto discard;
 	}
 
 	log(LOG_NOTICE, "toep=%p\n", toep);
 	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
 
 	toepcb_hold(toep);
 	process_abort_rpl(toep, m);
 	toepcb_release(toep);
 	return (0);
 
 discard:
 	m_free(m);
 	return (0);
 }
 
 /*
  * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
  * indicate whether RST should be sent in response.
  */
 static int
 abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 
 	switch (abort_reason) {
 	case CPL_ERR_BAD_SYN:
 #if 0		
 		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
 #endif		
 	case CPL_ERR_CONN_RESET:
 		// XXX need to handle SYN_RECV due to crossed SYNs
 		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
 	case CPL_ERR_XMIT_TIMEDOUT:
 	case CPL_ERR_PERSIST_TIMEDOUT:
 	case CPL_ERR_FINWAIT2_TIMEDOUT:
 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
 #if 0		
 		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
 #endif		
 		return (ETIMEDOUT);
 	default:
 		return (EIO);
 	}
 }
 
 /*
  * Fill in a CPL_ABORT_RPL work request in the given mbuf for connection
  * tid, carrying cmd as the reply command, and size the mbuf to exactly one
  * struct cpl_abort_rpl.
  */
 static inline void
 set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
 {
 	struct cpl_abort_rpl *rpl = cplhdr(m);
 
 	/* The mbuf holds the reply structure and nothing else. */
 	m->m_pkthdr.len = sizeof(*rpl);
 	m->m_len = m->m_pkthdr.len;
 
 	/* Work request header. */
 	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
 	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
 
 	/* CPL header and payload. */
 	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
 	rpl->cmd = cmd;
 }
 
 /*
  * Deferred-reply callback used by send_abort_rpl() when it could not
  * allocate a reply mbuf.  The request mbuf m still holds the original
  * CPL_ABORT_REQ_RSS, with the RST disposition stashed in req->status by
  * send_abort_rpl().  A reply is built in a freshly allocated mbuf, sent to
  * the adapter, and the request is freed.
  */
 static void
 send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
 {
 	struct mbuf *reply_mbuf;
 	struct cpl_abort_req_rss *req = cplhdr(m);
 
 	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
 	/*
 	 * The priority belongs on the reply that is about to be sent, not on
 	 * the request that is about to be freed.  The reply's length is set
 	 * by set_abort_rpl_wr().
 	 */
 	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
 	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
 	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
 	m_free(m);
 }
 
 /*
  * Tell whether the status of an ABORT_REQ_RSS message marks it as a
  * negative advice (retransmit or persist) rather than a real abort.
  * Returns 1 for negative advice, 0 otherwise.
  */
 static inline int
 is_neg_adv_abort(unsigned int status)
 {
 	switch (status) {
 	case CPL_ERR_RTX_NEG_ADVICE:
 	case CPL_ERR_PERSIST_NEG_ADVICE:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 static void
 send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
 {
 	struct mbuf  *reply_mbuf;
 	struct cpl_abort_req_rss *req = cplhdr(m);
 
 	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
 
 	if (!reply_mbuf) {
 		/* Defer the reply.  Stick rst_status into req->cmd. */
 		req->status = rst_status;
 		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
 		return;
 	}
 
 	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
 	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
 	m_free(m);
 
 	/*
 	 * XXX need to sync with ARP as for SYN_RECV connections we can send
 	 * these messages while ARP is pending.  For other connection states
 	 * it's not a problem.
 	 */
 	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
 }
 
 #ifdef notyet
 /*
  * Placeholder for removing a SYN_RECV child from its parent's SYN queue.
  * Only compiled when "notyet" is defined, and even then it just calls
  * CXGB_UNIMPLEMENTED(); the inner block is un-ported Linux code kept for
  * reference.
  */
 static void
 cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
 {
 	CXGB_UNIMPLEMENTED();
 #ifdef notyet	
 	struct request_sock *req = child->sk_user_data;
 
 	inet_csk_reqsk_queue_removed(parent, req);
 	synq_remove(tcp_sk(child));
 	__reqsk_free(req);
 	child->sk_user_data = NULL;
 #endif
 }
 
 
 /*
  * Performs the actual work to abort a SYN_RECV connection.
  * Only compiled when "notyet" is defined.
  */
 static void
 do_abort_syn_rcv(struct socket *child, struct socket *parent)
 {
 	struct tcpcb *parenttp = so_sototcpcb(parent);
 	struct tcpcb *childtp = so_sototcpcb(child);
 
 	/*
 	 * If the server is still open we clean up the child connection,
 	 * otherwise the server already did the clean up as it was purging
 	 * its SYN queue and the skb was just sitting in its backlog.
 	 */
 	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
 		cleanup_syn_rcv_conn(child, parent);
 		/* Offload resources are released under the child's inpcb lock. */
 		inp_wlock(childtp->t_inpcb);
 		t3_release_offload_resources(childtp->t_toe);
 		inp_wunlock(childtp->t_inpcb);
 		tcp_offload_close(childtp);
 	}
 }
 #endif
 
 /*
  * Handle abort requests for a SYN_RECV connection.  These need extra work
  * because the socket is on its parent's SYN queue.
  *
  * In the current build this only calls CXGB_UNIMPLEMENTED() and returns 0;
  * the real logic is the un-ported code under "notyet".  The caller,
  * process_abort_req(), treats a return of 0 as "abort fully handled".
  */
 static int
 abort_syn_rcv(struct socket *so, struct mbuf *m)
 {
 	CXGB_UNIMPLEMENTED();
 #ifdef notyet	
 	/*
 	 * NOTE(review): this block refers to "toep", which is not declared
 	 * in this function, so it would not compile as is.
 	 */
 	struct socket *parent;
 	struct toedev *tdev = toep->tp_toedev;
 	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
 	struct socket *oreq = so->so_incomp;
 	struct t3c_tid_entry *t3c_stid;
 	struct tid_info *t;
 
 	if (!oreq)
 		return -1;        /* somehow we are not on the SYN queue */
 
 	t = &(T3C_DATA(cdev))->tid_maps;
 	t3c_stid = lookup_stid(t, oreq->ts_recent);
 	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
 
 	so_lock(parent);
 	do_abort_syn_rcv(so, parent);
 	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
 	so_unlock(parent);
 #endif
 	return (0);
 }
 
 /*
  * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
  * request except that we need to reply to it.
  *
  * The first ABORT_REQ seen for a connection only sets TP_ABORT_REQ_RCVD and
  * TP_ABORT_SHUTDOWN and frees the mbuf; the actual processing happens when
  * another one arrives with TP_ABORT_REQ_RCVD already set.  In that case the
  * mbuf is consumed by send_abort_rpl(), except on the SYN_RECEIVED path
  * where abort_syn_rcv() reports that it took care of the abort.
  */
 static void
 process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
 {
 	int rst_status = CPL_ABORT_NO_RST;
 	const struct cpl_abort_req_rss *req = cplhdr(m);
 	struct tcpcb *tp = toep->tp_tp; 
 	struct socket *so;
 	int needclose = 0;
 	
 	inp_wlock(tp->t_inpcb);
 	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
 	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
 		/* First request: just note it and wait for the next one. */
 		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
 		m_free(m);
 		goto skip;
 	}
 
 	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
 	/*
 	 * Three cases to consider:
 	 * a) We haven't sent an abort_req; close the connection.
 	 * b) We have sent a post-close abort_req that will get to TP too late
 	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
 	 *    be ignored and the connection should be closed now.
 	 * c) We have sent a regular abort_req that will get to TP too late.
 	 *    That will generate an abort_rpl with status 0, wait for it.
 	 */
 	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
 	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
 		int error;
 		
 		/*
 		 * abort_status_to_errno() does not modify rst_status, so the
 		 * reply sent below always carries CPL_ABORT_NO_RST.
 		 */
 		error = abort_status_to_errno(so, req->status,
 		    &rst_status);
 		so_error_set(so, error);
 
 		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
 			so_sorwakeup(so);
 		/*
 		 * SYN_RECV needs special processing.  If abort_syn_rcv()
 		 * returns 0 is has taken care of the abort.
 		 */
 		/*
 		 * NOTE(review): on this path m is neither freed nor replied
 		 * to here; that is left to abort_syn_rcv(), whose real body
 		 * is currently compiled out.
 		 */
 		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
 			goto skip;
 
 		t3_release_offload_resources(toep);
 		needclose = 1;
 	}
 	inp_wunlock(tp->t_inpcb);
 
 	/* Close only after the inpcb lock has been released. */
 	if (needclose)
 		tcp_offload_close(tp);
 
 	/* send_abort_rpl() consumes m. */
 	send_abort_rpl(m, tdev, rst_status);
 	return;
 skip:
 	inp_wunlock(tp->t_inpcb);	
 }
 
 /*
  * Handle an ABORT_REQ_RSS CPL message.
  *
  * The handler context is the connection's toepcb.  Every path returns 0,
  * so the mbuf is disposed of on each of them: it is freed here, consumed
  * by send_abort_rpl(), or passed on to process_abort_req().
  */
 static int
 do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	const struct cpl_abort_req_rss *req = cplhdr(m);
 	struct toepcb *toep = (struct toepcb *)ctx;
 	
 	/* Negative advice is not a real abort; drop it. */
 	if (is_neg_adv_abort(req->status)) {
 		m_free(m);
 		return (0);
 	}
 
 	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
 	
 	/*
 	 * First ABORT_REQ for a connection that is still in SYN_RCVD: tear
 	 * the toepcb down right here.
 	 */
 	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
 		cxgb_remove_tid(cdev, toep, toep->tp_tid);
 		toep->tp_flags |= TP_ABORT_REQ_RCVD;
 		
 		/* send_abort_rpl() consumes m. */
 		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
 		if (toep->tp_l2t) 
 			l2t_release(L2DATA(cdev), toep->tp_l2t);
 
 		/*
 		 *  Unhook
 		 */
 		toep->tp_tp->t_toe = NULL;
 		toep->tp_tp->t_flags &= ~TF_TOE;
 		toep->tp_tp = NULL;
 		/*
 		 * XXX need to call syncache_chkrst - but we don't
 		 * have a way of doing that yet
 		 */
 		toepcb_release(toep);
 		log(LOG_ERR, "abort for unestablished connection :-(\n");
 		return (0);
 	}
 	if (toep->tp_tp == NULL) {
 		log(LOG_NOTICE, "disconnected toepcb\n");
 		/*
 		 * The toepcb should be freed momentarily.  The request
 		 * itself still has to be released here; no one else holds
 		 * a reference to it once this handler returns.
 		 */
 		m_free(m);
 		return (0);
 	}
 
 
 	/* Keep the toepcb alive across the processing of the request. */
 	toepcb_hold(toep);
 	process_abort_req(toep, m, toep->tp_toedev);
 	toepcb_release(toep);
 	return (0);
 }
 #ifdef notyet
 /*
  * Abort a SYN_RECV child after a failed passive open.  Only compiled when
  * "notyet" is defined.  On T3 the mbuf is reused to send a
  * CPL_PASS_ACCEPT_RPL with reject status; on other devices it is freed.
  */
 static void
 pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
 {
 	struct toedev *tdev = TOE_DEV(parent);
 
 	do_abort_syn_rcv(child, parent);
 	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
 		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
 
 		rpl->opt0h = htonl(F_TCAM_BYPASS);
 		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
 		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
 	} else
 		m_free(m);
 }
 #endif
 /*
  * React to an ARP failure for a passive-open reply.  In the current build
  * this only calls CXGB_UNIMPLEMENTED(); the intended logic (look up the
  * listening parent and abort the child under the parent's lock) is the
  * un-ported code under "notyet".
  */
 static void
 handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
 {
 	CXGB_UNIMPLEMENTED();
 	
 #ifdef notyet	
 	struct t3cdev *cdev;
 	struct socket *parent;
 	struct socket *oreq;
 	struct t3c_tid_entry *t3c_stid;
 	struct tid_info *t;
 	struct tcpcb *otp, *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	
 	/*
 	 * If the connection is being aborted due to the parent listening
 	 * socket going away there's nothing to do, the ABORT_REQ will close
 	 * the connection.
 	 */
 	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
 		m_free(m);
 		return;
 	}
 
 	oreq = so->so_incomp;
 	otp = so_sototcpcb(oreq);
 	
 	cdev = T3C_DEV(so);
 	t = &(T3C_DATA(cdev))->tid_maps;
 	t3c_stid = lookup_stid(t, otp->ts_recent);
 	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
 
 	so_lock(parent);
 	pass_open_abort(so, parent, m);
 	so_unlock(parent);
 #endif	
 }
 
 /*
  * ARP-failure callback installed on CPL_PASS_ACCEPT_RPL mbufs.  An ARP
  * failure here is handled much like an ABORT_REQ_RSS in SYN_RECV, since
  * both have to tear down a SYN_RECV connection; the socket recorded in the
  * mbuf is passed on to handle_pass_open_arp_failure().
  */
 static void
 pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
 {
 	struct socket *so;
 
 #ifdef notyet	
 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
 #endif
 	so = m_get_socket(m);
 	handle_pass_open_arp_failure(so, m);
 }
 
 /*
  * Populate a reject CPL_PASS_ACCEPT_RPL WR.
  */
 static void
 mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
 {
 	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
 	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
 	unsigned int tid = GET_TID(req);
 
 	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
 	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
 	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
 	rpl->opt0h = htonl(F_TCAM_BYPASS);
 	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
 	rpl->opt2 = 0;
 	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
 }
 
 /*
  * Deferred-reply callback that rejects the accept request held in m: a
  * reject CPL_PASS_ACCEPT_RPL is built in a new mbuf and sent to the
  * adapter, then the request is freed.
  */
 static void
 reject_pass_request(struct toedev *tdev, struct mbuf *m)
 {
 	struct mbuf *reply_mbuf =
 	    m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
 
 	mk_pass_accept_rpl(reply_mbuf, m);
 	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
 
 	/* The request is only needed while the reply is being built. */
 	m_free(m);
 }
 
 /*
  * Syncache event callback; arg is the toepcb that was registered together
  * with the syncache entry.  Both known events end with the toepcb reference
  * being dropped; an unknown event is logged and otherwise ignored.
  */
 static void
 handle_syncache_event(int event, void *arg)
 {
 	struct toepcb *toep = arg;
 
 	switch (event) {
 	case TOE_SC_ENTRY_PRESENT:
 		/* An entry already exists. */
 		printf("syncache entry present\n");
 		break;
 	case TOE_SC_DROP:
 		/*
 		 * The syncache has given up on this entry, either because
 		 * it timed out or because it was evicted.
 		 */
 		printf("syncache entry dropped\n");
 		break;
 	default:
 		log(LOG_ERR, "unknown syncache event %d\n", event);
 		return;
 	}
 
 	toepcb_release(toep);
 }
 
 /*
  * Enter an offloaded passive open into the syncache.
  *
  * Builds the connection info, TCP options and a minimal TCP header from the
  * CPL_PASS_ACCEPT_REQ and hands them to tcp_offload_syncache_add() together
  * with the listening socket's inpcb and the new toepcb.  Also seeds the
  * toepcb's sequence-tracking fields from the peer's initial sequence number.
  */
 static void
 syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
 {
 	struct in_conninfo inc;
 	struct tcpopt to;
 	/*
 	 * NOTE(review): th is never zeroed; only th_sport, th_dport, th_seq
 	 * and th_flags are assigned before it is passed on.
 	 */
 	struct tcphdr th;
 	struct inpcb *inp;
 	int mss, wsf, sack, ts;
 	uint32_t rcv_isn = ntohl(req->rcv_isn);
 	
 	bzero(&to, sizeof(struct tcpopt));
 	inp = so_sotoinpcb(lso);
 	
 	/*
 	 * Fill out information for entering us into the syncache
 	 */
 	bzero(&inc, sizeof(inc));
 	inc.inc_fport = th.th_sport = req->peer_port;
 	inc.inc_lport = th.th_dport = req->local_port;
 	/* Copied as received, without the ntohl() applied to rcv_isn above. */
 	th.th_seq = req->rcv_isn;
 	th.th_flags = TH_SYN;
 
 	/* All four fields start one past the peer's ISN. */
 	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
 
-	
-	inc.inc_isipv6 = 0;
 	inc.inc_len = 0;
 	inc.inc_faddr.s_addr = req->peer_ip;
 	inc.inc_laddr.s_addr = req->local_ip;
 
 	DPRINTF("syncache add of %d:%d %d:%d\n",
 	    ntohl(req->local_ip), ntohs(req->local_port),
 	    ntohl(req->peer_ip), ntohs(req->peer_port));
 	
 	/* Translate the options reported in the request into a tcpopt. */
 	mss = req->tcp_options.mss;
 	wsf = req->tcp_options.wsf;
 	ts = req->tcp_options.tstamp;
 	sack = req->tcp_options.sack;
 	to.to_mss = mss;
 	to.to_wscale = wsf;
 	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
 	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
 }
 
 
 /*
  * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
  * lock held.  Note that the sock here is a listening socket that is not owned
  * by the TOE.
  *
  * so	the listening socket
  * m	the mbuf holding the CPL_PASS_ACCEPT_REQ; consumed on every path
  * tdev	the offload device the request arrived on
  * lctx	the listen context whose SYN queue receives the new toepcb
  *
  * On success a toepcb is allocated for the embryonic connection, entered
  * into the TID table, the listener's SYN queue and the syncache, and an
  * accepting CPL_PASS_ACCEPT_RPL is sent through the L2T entry.  On failure
  * a reject (or TID release on non-T3 devices) is sent instead and any
  * toepcb allocated so far is released.
  */
 static void
 process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
     struct listen_ctx *lctx)
 {
 	int rt_flags;
 	struct l2t_entry *e;
 	struct iff_mac tim;
 	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
 	struct cpl_pass_accept_rpl *rpl;
 	struct cpl_pass_accept_req *req = cplhdr(m);
 	unsigned int tid = GET_TID(req);
 	struct tom_data *d = TOM_DATA(tdev);
 	struct t3cdev *cdev = d->cdev;
 	struct tcpcb *tp = so_sototcpcb(so);
 	/* Stays NULL until allocated so the reject path can tell. */
 	struct toepcb *newtoep = NULL;
 	struct rtentry *dst;
 	struct sockaddr_in nam;
 	struct t3c_data *td = T3C_DATA(cdev);
 
 	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
 	if (__predict_false(reply_mbuf == NULL)) {
 		/* No reply mbuf: defer the reject (T3) or just give the TID back. */
 		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
 			t3_defer_reply(m, tdev, reject_pass_request);
 		else {
 			cxgb_queue_tid_release(cdev, tid);
 			m_free(m);
 		}
 		DPRINTF("failed to get reply_mbuf\n");
 		
 		goto out;
 	}
 
 	if (tp->t_state != TCPS_LISTEN) {
 		DPRINTF("socket not in listen state\n");
 		
 		goto reject;
 	}
 	
 	/* Find the interface the SYN arrived on from its MAC and VLAN. */
 	tim.mac_addr = req->dst_mac;
 	tim.vlan_tag = ntohs(req->vlan_tag);
 	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
 		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
 		goto reject;
 	}
 	
 #ifdef notyet
 	/*
 	 * XXX do route lookup to confirm that we're still listening on this
 	 * address
 	 */
 	if (ip_route_input(skb, req->local_ip, req->peer_ip,
 			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
 		goto reject;
 	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
 		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
 	dst_release(skb->dst);	// done with the input route, release it
 	skb->dst = NULL;
 	
 	if ((rt_flags & RTF_LOCAL) == 0)
 		goto reject;
 #endif
 	/*
 	 * XXX
 	 */
 	rt_flags = RTF_LOCAL;
 	if ((rt_flags & RTF_LOCAL) == 0)
 		goto reject;
 	
 	/*
 	 * Calculate values and add to syncache
 	 */
 
 	newtoep = toepcb_alloc();
 	if (newtoep == NULL)
 		goto reject;
 
 	bzero(&nam, sizeof(struct sockaddr_in));
 	
 	nam.sin_len = sizeof(struct sockaddr_in);
 	nam.sin_family = AF_INET;
 	nam.sin_addr.s_addr =req->peer_ip;
 	/*
 	 * NOTE(review): the route returned here is not released anywhere in
 	 * this function - confirm the reference semantics of rtalloc2().
 	 */
 	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
 
 	if (dst == NULL) {
 		printf("failed to find route\n");
 		goto reject;
 	}
 	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
 	    (struct sockaddr *)&nam);
 	if (e == NULL) {
 		/*
 		 * Without an L2T entry the reply can be neither built
 		 * (e->idx, e->smt_idx) nor sent (l2t_send), so reject.
 		 */
 		DPRINTF("failed to get l2t\n");
 		goto reject;
 	}
 	/*
 	 * Point to our listen socket until accept
 	 */
 	newtoep->tp_tp = tp;
 	newtoep->tp_flags = TP_SYN_RCVD;
 	newtoep->tp_tid = tid;
 	newtoep->tp_toedev = tdev;
 	tp->rcv_wnd = select_rcv_wnd(tdev, so);
 	
 	cxgb_insert_tid(cdev, d->client, newtoep, tid);
 	so_lock(so);
 	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
 	so_unlock(so);
 
 	/*
 	 * Use DDP only if it is enabled for the device, not disabled on the
 	 * socket, and the receive window is large enough.
 	 */
 	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
 		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
 
 	if (newtoep->tp_ulp_mode) {
 		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
 		
 		/* Fall back to non-DDP mode if the extra mbuf is unavailable. */
 		if (ddp_mbuf == NULL)
 			newtoep->tp_ulp_mode = 0;
 	}
 	
 	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
 	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
 	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
 	/*
 	 * XXX workaround for lack of syncache drop
 	 */
 	toepcb_hold(newtoep);
 	syncache_add_accept_req(req, so, newtoep);
 	
 	/* Build the accepting CPL_PASS_ACCEPT_RPL. */
 	rpl = cplhdr(reply_mbuf);
 	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
 	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	rpl->wr.wr_lo = 0;
 	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
 	rpl->opt2 = htonl(calc_opt2(so, tdev));
 	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
 	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
 
 	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
 	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
 	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
 				  CPL_PASS_OPEN_ACCEPT);
 
 	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
 	
 	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
 		
 	l2t_send(cdev, reply_mbuf, e);
 	m_free(m);
 	if (newtoep->tp_ulp_mode) {	
 		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
 				V_TF_DDP_OFF(1) |
 				TP_DDP_TIMER_WORKAROUND_MASK,
 				V_TF_DDP_OFF(1) |
 		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
 	} else
 		DPRINTF("no DDP\n");
 
 	return;
 reject:
 	/*
 	 * newtoep is NULL here when the request was rejected before a toepcb
 	 * was allocated.
 	 */
 	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
 		mk_pass_accept_rpl(reply_mbuf, m);
 	else 
 		mk_tid_release(reply_mbuf, newtoep, tid);
 	cxgb_ofld_send(cdev, reply_mbuf);
 	m_free(m);
 	/*
 	 * Every jump to this label happens before the toepcb is entered into
 	 * the TID table, the SYN queue or the syncache, so the reference
 	 * dropped here is presumably the only one - confirm against
 	 * toepcb_alloc().
 	 */
 	if (newtoep != NULL)
 		toepcb_release(newtoep);
 out:
 #if 0
 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 #else
 	return;
 #endif	
 }      
 
 /*
  * CPL dispatch entry point for CPL_PASS_ACCEPT_REQ messages.  The handler
  * context is the listen context of the listening socket the SYN matched;
  * the request is processed by process_pass_accept_req() against that
  * socket and the TOM device recorded in the context.
  */
 static int
 do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	struct listen_ctx *lctx = ctx;
 	/* XXX need an interlock against the listen socket going away */
 	struct socket *lso = lctx->lso;
 	struct tom_data *d = lctx->tom_data;
 
 #if VALIDATE_TID
 	struct cpl_pass_accept_req *req = cplhdr(m);
 	unsigned int tid = GET_TID(req);
 	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
 
 	if (unlikely(!lsk)) {
 		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
 		       cdev->name,
 		       (unsigned long)((union listen_entry *)ctx -
 					t->stid_tab));
 		return CPL_RET_BUF_DONE;
 	}
 	if (unlikely(tid >= t->ntids)) {
 		printk(KERN_ERR "%s: passive open TID %u too large\n",
 		       cdev->name, tid);
 		return CPL_RET_BUF_DONE;
 	}
 	/*
 	 * For T3A the current user of the TID may have closed but its last
 	 * message(s) may have been backlogged so the TID appears to be still
 	 * in use.  Just take the TID away, the connection can close at its
 	 * own leisure.  For T3B this situation is a bug.
 	 */
 	if (!valid_new_tid(t, tid) &&
 	    cdev->type != T3A) {
 		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
 		       cdev->name, tid);
 		return CPL_RET_BUF_DONE;
 	}
 #endif
 
 	process_pass_accept_req(lso, m, &d->tdev, lctx);
 
 	return (0);
 }
 
 /*
  * Translate the TCP options reported by HW for a newly established
  * connection into FreeBSD's native tcpcb/toepcb representation.  Must be
  * called with the connection's inpcb lock held.
  */
 static void
 assign_rxopt(struct socket *so, unsigned int opt)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
 
 	inp_lock_assert(tp->t_inpcb);
 
 	/* MSS index into the adapter's MTU table, minus 40 bytes. */
 	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
 
 	/* Record which options were negotiated. */
 	if (G_TCPOPT_TSTAMP(opt))
 		tp->t_flags |= TF_RCVD_TSTMP;
 	if (G_TCPOPT_SACK(opt))
 		tp->t_flags |= TF_SACK_PERMIT;
 	if (G_TCPOPT_WSCALE_OK(opt))
 		tp->t_flags |= TF_RCVD_SCALE;
 
 	/* Window scaling takes effect only if both sides asked for it. */
 	if ((tp->t_flags & TF_RCVD_SCALE) != 0 &&
 	    (tp->t_flags & TF_REQ_SCALE) != 0)
 		tp->rcv_scale = tp->request_r_scale;
 }
 
 /*
  * Finish initializing a connection that has just been established and move
  * it to TCPS_ESTABLISHED.
  *
  * snd_isn is the ISN after the SYN, i.e., the true ISN + 1.  opt carries
  * the TCP options reported by HW and is decoded by assign_rxopt().
  */
 static void
 make_established(struct socket *so, u32 snd_isn, unsigned int opt)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 
 	/* Every send-side sequence variable starts at the post-SYN ISN. */
 	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
 	assign_rxopt(so, opt);
 
 #ifdef notyet
 	/* XXX socket option handling is not hooked up yet. */
 	so->so_proto->pr_ctloutput = t3_ctloutput;
 #endif
 
 	/*
 	 * XXX not clear what rcv_wup maps to.
 	 *
 	 * Pulling tp_rcv_wup back causes the first RX_DATA_ACK to supply any
 	 * Rx credits we couldn't pass through opt0.
 	 */
 	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
 		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
 
 	dump_toepcb(toep);
 
 #ifdef notyet
 	/* no clean interface for marking ARP up to date */
 	dst_confirm(sk->sk_dst_cache);
 #endif
 	tp->t_state = TCPS_ESTABLISHED;
 	tp->t_starttime = ticks;
 	soisconnected(so);
 }
 
 /*
  * Build the connection info, TCP options and a minimal TCP header from a
  * CPL_PASS_ESTABLISH message and hand them to
  * tcp_offload_syncache_expand().
  *
  * req:  the CPL_PASS_ESTABLISH message from the adapter.
  * so:   in/out socket pointer, passed straight through to
  *       tcp_offload_syncache_expand().
  * toep: offload PCB; only used here to locate the adapter's MTU table.
  *
  * Returns the result of tcp_offload_syncache_expand(); the caller in this
  * file treats zero as "no syncache entry".
  */
 static int
 syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
 {
 
 	struct in_conninfo inc;
 	struct tcpopt to;
 	struct tcphdr th;
 	int mss, wsf, sack, ts;
 	struct mbuf *m = NULL;	/* no mbuf is supplied to the syncache */
 	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
 	unsigned int opt;
 	
 #ifdef MAC
 #error	"no MAC support"
 #endif	
 	
 	opt = ntohs(req->tcp_opt);
 	
 	bzero(&to, sizeof(struct tcpopt));
 	
 	/*
 	 * Fill out information for entering us into the syncache
 	 *
 	 * NOTE(review): th is not zeroed; only th_sport, th_dport, th_seq and
 	 * th_flags are set below, so its remaining fields hold stack garbage.
 	 * Confirm tcp_offload_syncache_expand() reads nothing else.
 	 */
 	bzero(&inc, sizeof(inc));
 	inc.inc_fport = th.th_sport = req->peer_port;
 	inc.inc_lport = th.th_dport = req->local_port;
 	th.th_seq = req->rcv_isn;
 	th.th_flags = TH_ACK;
 	
-	inc.inc_isipv6 = 0;
 	inc.inc_len = 0;
 	inc.inc_faddr.s_addr = req->peer_ip;
 	inc.inc_laddr.s_addr = req->local_ip;
 	
 	/* Decode the negotiated options out of the hardware option word. */
 	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
 	wsf  = G_TCPOPT_WSCALE_OK(opt);
 	ts   = G_TCPOPT_TSTAMP(opt);
 	sack = G_TCPOPT_SACK(opt);
 	
 	to.to_mss = mss;
 	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
 	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
 
 	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
 	    ntohl(req->local_ip), ntohs(req->local_port),
 	    ntohl(req->peer_ip), ntohs(req->peer_port),
 	    mss, wsf, ts, sack);
 	return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
 }
 
 
 /*
  * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
  * if we are in TCP_SYN_RECV due to crossed SYNs
  *
  * cdev: the adapter the message arrived on.
  * m:    mbuf holding the CPL message; freed with m_free() before returning.
  * ctx:  the toepcb registered for this TID.
  *
  * The toepcb is taken off the SYN queue, the syncache entry is expanded
  * into a new socket, the toepcb is attached to that socket's tcpcb and the
  * connection is marked established.  Always returns 0.
  */
 static int
 do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	struct cpl_pass_establish *req = cplhdr(m);
 	struct toepcb *toep = (struct toepcb *)ctx;
 	struct tcpcb *tp = toep->tp_tp;
 	struct socket *so, *lso;
 	struct t3c_data *td = T3C_DATA(cdev);
 	struct sockbuf *snd, *rcv;
 	
 	// Complete socket initialization now that we have the SND_ISN
 	
 	struct toedev *tdev;
 
 
 	tdev = toep->tp_toedev;
 
 	inp_wlock(tp->t_inpcb);
 	
 	/*
 	 * Look up the socket currently attached to the toepcb's tcpcb
 	 * (presumably the listening socket, hence "lso").
 	 *
 	 * XXX need to add reference while we're manipulating
 	 */
 	so = lso = inp_inpcbtosocket(tp->t_inpcb);
 
 	inp_wunlock(tp->t_inpcb);
 
 	/* Take the toepcb off the SYN queue under that socket's lock. */
 	so_lock(so);
 	LIST_REMOVE(toep, synq_entry);
 	so_unlock(so);
 	
 	/* On success 'so' is replaced through the &so out-parameter. */
 	if (!syncache_expand_establish_req(req, &so, toep)) {
 		/*
 		 * No entry 
 		 */
 		CXGB_UNIMPLEMENTED();
 	}
 	if (so == NULL) {
 		/*
 		 * Couldn't create the socket
 		 */
 		CXGB_UNIMPLEMENTED();
 	}
 
 	/* From here on tp refers to the tcpcb of the socket in 'so'. */
 	tp = so_sototcpcb(so);
 	inp_wlock(tp->t_inpcb);
 
 	snd = so_sockbuf_snd(so);
 	rcv = so_sockbuf_rcv(so);
 
 	snd->sb_flags |= SB_NOCOALESCE;
 	rcv->sb_flags |= SB_NOCOALESCE;
 
 	/* Cross-link the toepcb and the tcpcb and reset per-connection state. */
 	toep->tp_tp = tp;
 	toep->tp_flags = 0;
 	tp->t_toe = toep;
 	reset_wr_list(toep);
 	tp->rcv_wnd = select_rcv_wnd(tdev, so);
 	tp->rcv_nxt = toep->tp_copied_seq;
 	install_offload_ops(so);
 	
 	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
 	toep->tp_wr_unacked = 0;
 	/* The queue number is carried in the mbuf's csum_data field. */
 	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
 	toep->tp_qset_idx = 0;
 	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
 	
 	/*
 	 * XXX Cancel any keep alive timer
 	 */
 	     
 	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
 
 	/*
 	 * XXX workaround for lack of syncache drop
 	 *
 	 * NOTE(review): toep is still dereferenced below (tp_tid) after this
 	 * release; confirm this cannot drop the last reference.
 	 */
 	toepcb_release(toep);
 	inp_wunlock(tp->t_inpcb);
 	
 	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
 	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
 #ifdef notyet
 	/*
 	 * XXX not sure how these checks map to us
 	 */
 	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
 		sk->sk_state_change(sk);
 		sk_wake_async(so, 0, POLL_OUT);
 	}
 	/*
 	 * The state for the new connection is now up to date.
 	 * Next check if we should add the connection to the parent's
 	 * accept queue.  When the parent closes it resets connections
 	 * on its SYN queue, so check if we are being reset.  If so we
 	 * don't need to do anything more, the coming ABORT_RPL will
 	 * destroy this socket.  Otherwise move the connection to the
 	 * accept queue.
 	 *
 	 * Note that we reset the synq before closing the server so if
 	 * we are not being reset the stid is still open.
 	 */
 	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
 		__kfree_skb(skb);
 		goto unlock;
 	}
 #endif
 	m_free(m);
 
 	return (0);
 }
 
 /*
  * Drain the toepcb's out-of-order queue: stamp each deferred CPL message
  * with the connection's TID and pass it to the TOE.  The inpcb lock is
  * asserted before the queue is touched.
  */
 static void
 fixup_and_send_ofo(struct toepcb *toep)
 {
 	struct tcpcb *tp = toep->tp_tp;
 	struct toedev *tdev = toep->tp_toedev;
 	unsigned int tid = toep->tp_tid;
 	struct cpl_close_con_req *hdr;
 	struct mbuf *pending;
 
 	log(LOG_NOTICE, "fixup_and_send_ofo\n");
 
 	inp_lock_assert(tp->t_inpcb);
 	for (;;) {
 		pending = mbufq_dequeue(&toep->out_of_order_queue);
 		if (pending == NULL)
 			break;
 
 		/*
 		 * Several message types may be queued, but the fields patched
 		 * here are common to all of them, so any one layout will do.
 		 */
 		hdr = cplhdr(pending);
 		hdr->wr.wr_lo = htonl(V_WR_TID(tid));
 		OPCODE_TID(hdr) = htonl(MK_OPCODE_TID(hdr->ot.opcode, tid));
 		cxgb_ofld_send(TOM_DATA(tdev)->cdev, pending);
 	}
 }
 
 /*
  * Updates socket state from an active establish CPL message.  Runs with the
  * socket lock held.  (do_act_establish() in this file calls it with the
  * inpcb write lock held.)
  *
  * so: the socket being connected.
  * m:  mbuf holding the CPL_ACT_ESTABLISH message; freed here with m_free().
  */
 static void
 socket_act_establish(struct socket *so, struct mbuf *m)
 {
 	INIT_VNET_INET(so->so_vnet);
 	struct cpl_act_establish *req = cplhdr(m);
 	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	
 	/* An unexpected state is only logged; processing continues regardless. */
 	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
 		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
 		    toep->tp_tid, tp->t_state);
 
 	tp->ts_recent_age = ticks;
 	/*
 	 * NOTE(review): rcv_wnd is assigned the peer's sequence number here,
 	 * and make_established() below compares rcv_wnd against a window
 	 * size.  Confirm this is intended rather than a mistranslated field.
 	 */
 	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
 	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
 
 	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
 	
 	/*
 	 * Now that we finally have a TID send any CPL messages that we had to
 	 * defer for lack of a TID.
 	 */
 	if (mbufq_len(&toep->out_of_order_queue))
 		fixup_and_send_ofo(toep);
 
 	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
 		/*
 		 * XXX does this even make sense?
 		 */
 		so_sorwakeup(so);
 	}
 	m_free(m);
 #ifdef notyet
 /*
  * XXX assume no write requests permitted while socket connection is
  * incomplete
  */
 	/*
 	 * Currently the send queue must be empty at this point because the
 	 * socket layer does not send anything before a connection is
 	 * established.  To be future proof though we handle the possibility
 	 * that there are pending buffers to send (either TX_DATA or
 	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
 	 * buffers according to the just learned write_seq, and then we send
 	 * them on their way.
 	 */
 	fixup_pending_writeq_buffers(sk);
 	if (t3_push_frames(so, 1))
 		sk->sk_write_space(sk);
 #endif
 
 	/* Mirror the TCP state into the toepcb and count the new connection. */
 	toep->tp_state = tp->t_state;
 	V_tcpstat.tcps_connects++;
 				
 }
 
 /*
  * Handler for CPL_ACT_ESTABLISH: an active open has completed.
  *
  * cdev: adapter the message arrived on.
  * m:    mbuf holding the CPL message (freed by socket_act_establish()).
  * ctx:  the toepcb that was registered under the active-open TID.
  *
  * Swaps the temporary atid for the real TID and brings the socket into the
  * established state.  If the toepcb no longer has a tcpcb, only the atid is
  * released.  Always returns 0.
  */
 static int
 do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 {
 	struct toepcb *toep = ctx;
 	struct cpl_act_establish *cpl = cplhdr(m);
 	unsigned int new_tid = GET_TID(cpl);
 	unsigned int atid = G_PASS_OPEN_TID(ntohl(cpl->tos_tid));
 	struct tcpcb *tp = toep->tp_tp;
 	struct tom_data *tomd;
 	struct toedev *tdev;
 	struct socket *so;
 
 	if (tp == NULL) {
 		free_atid(cdev, atid);
 		return (0);
 	}
 
 	inp_wlock(tp->t_inpcb);
 
 	/*
 	 * XXX
 	 */
 	so = inp_inpcbtosocket(tp->t_inpcb);
 	tdev = toep->tp_toedev; /* blow up here if link was down */
 	tomd = TOM_DATA(tdev);
 
 	/*
 	 * The TID may still be in use: its previous owner may have backlogged
 	 * its last CPL message(s).  That is fine, we simply take it over.
 	 */
 	toep->tp_tid = new_tid;
 	toep->tp_tp = tp;
 	so_insert_tid(tomd, toep, new_tid);
 	free_atid(cdev, atid);
 	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
 
 	socket_act_establish(so, m);
 	inp_wunlock(tp->t_inpcb);
 
 	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
 	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
 
 	return (0);
 }
 
 /*
  * Process an acknowledgment of WR completion.  Advance snd_una and send the
  * next batch of work requests from the write queue.
  *
  * toep: offload PCB of the connection being acknowledged.
  * m:    mbuf holding the CPL_WR_ACK message; always freed before returning.
  *
  * The returned credits are added back to tp_wr_avail and are used to retire
  * work requests from the head of the pending list.  The bytes of every
  * fully retired request are dropped from the send socket buffer.  The whole
  * function runs under the inpcb write lock.
  */
 static void
 wr_ack(struct toepcb *toep, struct mbuf *m)
 {
 	struct tcpcb *tp = toep->tp_tp;
 	struct cpl_wr_ack *hdr = cplhdr(m);
 	struct socket *so;
 	unsigned int credits = ntohs(hdr->credits);
 	u32 snd_una = ntohl(hdr->snd_una);
 	int bytes = 0;
 	struct sockbuf *snd;
 	
 	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
 
 	inp_wlock(tp->t_inpcb);
 	so = inp_inpcbtosocket(tp->t_inpcb);
 	toep->tp_wr_avail += credits;
 	/* The unacked count can never exceed the credits still outstanding. */
 	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
 		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
 
 	/*
 	 * Retire pending work requests.  Each request's credit cost is kept in
 	 * its mbuf's csum_data field and its payload size in m_pkthdr.len.
 	 */
 	while (credits) {
 		struct mbuf *p = peek_wr(toep);
 		
 		if (__predict_false(!p)) {
 			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
 			    "nothing pending, state %u wr_avail=%u\n",
 			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
 			break;
 		}
 		CTR2(KTR_TOM,
 			"wr_ack: p->credits=%d p->bytes=%d",
 		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
 		KASSERT(p->m_pkthdr.csum_data != 0,
 		    ("empty request still on list"));
 
 		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
 
 			/*
 			 * NOTE(review): this debug block still uses Linux
 			 * sk_buff names (p->csum, skb_shinfo) and probably
 			 * does not build when DEBUG_WR > 1.
 			 */
 #if DEBUG_WR > 1
 			struct tx_data_wr *w = cplhdr(p);
 			log(LOG_ERR,
 			       "TID %u got %u WR credits, need %u, len %u, "
 			       "main body %u, frags %u, seq # %u, ACK una %u,"
 			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
 			       toep->tp_tid, credits, p->csum, p->len,
 			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
 			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
 			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
 #endif
 			/* Partial ack: remember what this request still needs. */
 			p->m_pkthdr.csum_data -= credits;
 			break;
 		} else {
 			dequeue_wr(toep);
 			credits -= p->m_pkthdr.csum_data;
 			bytes += p->m_pkthdr.len;
 			CTR3(KTR_TOM,
 			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
 			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
 	
 			m_free(p);
 		}
 	}
 
 #if DEBUG_WR
 	check_wr_invariants(tp);
 #endif
 
 	/* An ack that would move snd_una backwards is ignored. */
 	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
 #if VALIDATE_SEQ
 		struct tom_data *d = TOM_DATA(TOE_DEV(so));
 
 		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
 		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
 		    toep->tp_tid, tp->snd_una);
 #endif
 		goto out_free;
 	}
 
 	if (tp->snd_una != snd_una) {
 		tp->snd_una = snd_una;
 		tp->ts_recent_age = ticks;
 #ifdef notyet
 		/*
 		 * Keep ARP entry "minty fresh"
 		 */
 		dst_confirm(sk->sk_dst_cache);
 #endif
 		if (tp->snd_una == tp->snd_nxt)
 			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
 	}
 
 	snd = so_sockbuf_snd(so);
 	if (bytes) {
 		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
 		/* Lock the send buffer, drop the acked bytes, wake writers. */
 		sockbuf_lock(snd);		
 		sbdrop_locked(snd, bytes);
 		so_sowwakeup_locked(so);
 	}
 
 	/* Data beyond the send pointer is still unsent: push more frames. */
 	if (snd->sb_sndptroff < snd->sb_cc)
 		t3_push_frames(so, 0);
 
 out_free:
 	inp_wunlock(tp->t_inpcb);
 	m_free(m);
 }
 
 /*
  * CPL handler for work-request acknowledgments (registered for
  * CPL_TX_DMA_ACK in t3_init_cpl_io()).  'ctx' is the connection's toepcb.
  * All of the work is done by wr_ack(), which also frees 'm'.
  * Always returns 0.
  */
 static int
 do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
 {
 	struct toepcb *toep = ctx;
 
 	VALIDATE_SOCK(so);
 	wr_ack(toep, m);
 
 	return 0;
 }
 
 /*
  * CPL handler for TRACE_PKT messages.  The packet is not inspected; the
  * mbuf chain is released and 0 is returned.
  */
 static int
 do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
 {
 
 	m_freem(m);
 
 	return 0;
 }
 
 /*
  * Reset a connection that is sitting on a listener's SYN queue or accept
  * queue, i.e., one that has not been handed to the application yet.
  * Must be called from process context.
  *
  * Modeled after code in inet_csk_listen_stop().
  */
 static void
 t3_reset_listen_child(struct socket *child)
 {
 	struct tcpcb *child_tp;
 
 	child_tp = so_sototcpcb(child);
 	t3_send_reset(child_tp->t_toe);
 }
 
 
 /*
  * Per-socket callback used by t3_disconnect_acceptq(): resets 'so' if its
  * tcpcb is flagged TF_TOE, holding the inpcb write lock around the reset.
  * Sockets without TF_TOE are left untouched.  'arg' is unused.
  */
 static void
 t3_child_disconnect(struct socket *so, void *arg)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 
 	if ((tp->t_flags & TF_TOE) == 0)
 		return;
 
 	inp_wlock(tp->t_inpcb);
 	t3_reset_listen_child(so);
 	inp_wunlock(tp->t_inpcb);
 }
 
 /*
  * Disconnect offloaded connections that are established but have not been
  * accepted yet and still sit on a server's accept queue.  Only an ABORT_REQ
  * is sent here; the disconnect is finished later because we may have to
  * wait for the ABORT_RPL.
  *
  * t3_child_disconnect() is applied to every child of 'listen_so' while the
  * listening socket is locked.
  */
 void
 t3_disconnect_acceptq(struct socket *listen_so)
 {
 
 	so_lock(listen_so);
 	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
 	so_unlock(listen_so);
 }
 
 /*
  * Reset the offloaded connections sitting on a server's SYN queue.  As with
  * the accept queue, an ABORT_REQ is sent now and the teardown completes
  * when the ABORT_RPL arrives.
  *
  * Each toepcb is unlinked from the queue, detached from its tcpcb, reset,
  * has its TID removed and is then released.  The listening socket's lock
  * is held for the whole walk.
  */
 
 void
 t3_reset_synq(struct listen_ctx *lctx)
 {
 	struct toepcb *pending;
 
 	so_lock(lctx->lso);
 	for (;;) {
 		pending = LIST_FIRST(&lctx->synq_head);
 		if (pending == NULL)
 			break;
 
 		LIST_REMOVE(pending, synq_entry);
 		pending->tp_tp = NULL;
 		t3_send_reset(pending);
 		cxgb_remove_tid(TOEP_T3C_DEV(pending), pending, pending->tp_tid);
 		toepcb_release(pending);
 	}
 	so_unlock(lctx->lso);
 }
 
 
 /*
  * Write the page pods describing a DDP gather list to adapter memory.
  *
  * toep:   offload PCB; supplies the TID and the device to send through.
  * gl:     gather list whose pages are described by the pods.
  * nppods: total number of pods to write, including the trailing
  *         NUM_SENTINEL_PPODS sentinel pods.
  * tag:    pod tag; also selects the first pod address
  *         (tag * PPOD_SIZE + ddp_llimit).
  * maxoff, pg_off, color: values stored into every valid pod.
  *
  * One ULP_MEM_WRITE request is built per pod and passed to
  * send_or_defer().  Always returns 0.
  */
 int
 t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
 		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
 		   unsigned int pg_off, unsigned int color)
 {
 	unsigned int i, j, pidx;
 	struct pagepod *p;
 	struct mbuf *m;
 	struct ulp_mem_io *req;
 	unsigned int tid = toep->tp_tid;
 	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
 	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
 
 	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
 	    gl, nppods, tag, maxoff, pg_off, color);
 	
 	for (i = 0; i < nppods; ++i) {
 		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
 		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 		req = mtod(m, struct ulp_mem_io *);
 		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
 		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
 		req->wr.wr_lo = 0;
 		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
 					   V_ULPTX_CMD(ULP_MEM_WRITE));
 		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
 				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
 
 		/* The pod itself immediately follows the request header. */
 		p = (struct pagepod *)(req + 1);
 		/*
 		 * Every pod except the trailing sentinels takes this branch,
 		 * so hint it as the expected case.
 		 */
 		if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
 			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
 			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
 						  V_PPOD_COLOR(color));
 			p->pp_max_offset = htonl(maxoff);
 			p->pp_page_offset = htonl(pg_off);
 			p->pp_rsvd = 0;
 			/*
 			 * Each pod carries five page addresses but successive
 			 * pods advance by four pages; slots past the end of
 			 * the gather list are zeroed.
 			 */
 			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
 				p->pp_addr[j] = pidx < gl->dgl_nelem ?
 				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
 		} else
 			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
 		send_or_defer(toep, m, 0);
 		ppod_addr += PPOD_SIZE;
 	}
 	return (0);
 }
 
 /*
  * Fill in 'b' as a CPL_BARRIER message carried as the payload of a
  * ULP_TX_PKT command.  The ULP_TX_PKT header fields overlay the start of
  * the CPL structure, so they are written through a cast of the same pointer.
  */
 static inline void
 mk_cpl_barrier_ulp(struct cpl_barrier *b)
 {
 	struct ulp_txpkt *ulp;
 
 	ulp = (struct ulp_txpkt *)b;
 	ulp->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
 	ulp->len = htonl(V_ULPTX_NFLITS(sizeof(struct cpl_barrier) / 8));
 
 	b->opcode = CPL_BARRIER;
 }
 
 /*
  * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
  *
  * req:   message buffer to fill in; the ULP_TX_PKT header fields overlay
  *        its first bytes.
  * tid:   connection whose TCB is requested.
  * cpuno: value stored (in network byte order) in the request's cpuno field.
  */
 static inline void
 mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
 {
 	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
 
 	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
 	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
 	req->cpuno = htons(cpuno);
 }
 
 /*
  * Fill in 'req' as a CPL_SET_TCB_FIELD message carried as the payload of a
  * ULP_TX_PKT command.
  *
  * tid:  connection whose TCB is modified.
  * word: TCB word to update.
  * mask: mask stored with the request.
  * val:  value stored with the request.
  *
  * The request is marked no-reply and its cpu_idx is set to 0.
  */
 static inline void
 mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
                      unsigned int word, uint64_t mask, uint64_t val)
 {
 	struct ulp_txpkt *ulp;
 
 	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
 	    tid, word, mask, val);
 
 	/* ULP_TX_PKT header, overlaid on the start of the CPL message. */
 	ulp = (struct ulp_txpkt *)req;
 	ulp->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
 	ulp->len = htonl(V_ULPTX_NFLITS(sizeof(struct cpl_set_tcb_field) / 8));
 
 	/* CPL_SET_TCB_FIELD proper. */
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
 	req->reply = V_NO_REPLY(1);
 	req->cpu_idx = 0;
 	req->word = htons(word);
 	req->mask = htobe64(mask);
 	req->val = htobe64(val);
 }
 
 /*
  * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
  */
 static void
 mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
     unsigned int tid, unsigned int credits)
 {
 	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
 
 	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
 	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
 	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
 	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
 	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
 				 V_RX_CREDITS(credits));
 }
 
 void
 t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
 {
 	unsigned int wrlen;
 	struct mbuf *m;
 	struct work_request_hdr *wr;
 	struct cpl_barrier *lock;
 	struct cpl_set_tcb_field *req;
 	struct cpl_get_tcb *getreq;
 	struct ddp_state *p = &toep->tp_ddp_state;
 
 #if 0
 	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
 #endif
 	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
 		sizeof(*getreq);
 	m = m_gethdr_nofail(wrlen);
 	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 	wr = mtod(m, struct work_request_hdr *);
 	bzero(wr, wrlen);
 	
 	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
 	m->m_pkthdr.len = m->m_len = wrlen;
 
 	lock = (struct cpl_barrier *)(wr + 1);
 	mk_cpl_barrier_ulp(lock);
 
 	req = (struct cpl_set_tcb_field *)(lock + 1);
 
 	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
 
 	/* Hmmm, not sure if this actually a good thing: reactivating
 	 * the other buffer might be an issue if it has been completed
 	 * already. However, that is unlikely, since the fact that the UBUF
 	 * is not completed indicates that there is no oustanding data.
 	 */
 	if (bufidx == 0)
 		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
 				     V_TF_DDP_ACTIVE_BUF(1) |
 				     V_TF_DDP_BUF0_VALID(1),
 				     V_TF_DDP_ACTIVE_BUF(1));
 	else
 		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
 				     V_TF_DDP_ACTIVE_BUF(1) |
 				     V_TF_DDP_BUF1_VALID(1), 0);
 
 	getreq = (struct cpl_get_tcb *)(req + 1);
 	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
 
 	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
 
 	/* Keep track of the number of oustanding CPL_GET_TCB requests
 	 */
 	p->get_tcb_count++;
 	
 #ifdef T3_TRACE
 	T3_TRACE1(TIDTB(so),
 		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
 #endif
 	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
 }
 
 /**
  * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
  * @sk: the socket associated with the buffers
  * @bufidx: index of HW DDP buffer (0 or 1)
  * @tag0: new tag for HW buffer 0
  * @tag1: new tag for HW buffer 1
  * @len: new length for HW buf @bufidx
  *
  * Sends a compound WR to overlay a new DDP buffer on top of an existing
  * buffer by changing the buffer tag and length and setting the valid and
  * active flag accordingly.  The caller must ensure the new buffer is at
  * least as big as the existing one.  Since we typically reprogram both HW
  * buffers this function sets both tags for convenience. Read the TCB to
  * determine how made data was written into the buffer before the overlay
  * took place.
  */
 void
 t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
 	 	       unsigned int tag1, unsigned int len)
 {
 	unsigned int wrlen;
 	struct mbuf *m;
 	struct work_request_hdr *wr;
 	struct cpl_get_tcb *getreq;
 	struct cpl_set_tcb_field *req;
 	struct ddp_state *p = &toep->tp_ddp_state;
 
 	CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
 	    bufidx, tag0, tag1, len);
 #if 0
 	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
 #endif	
 	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
 	m = m_gethdr_nofail(wrlen);
 	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 	wr = mtod(m, struct work_request_hdr *);
 	m->m_pkthdr.len = m->m_len = wrlen;
 	bzero(wr, wrlen);
 
 	
 	/* Set the ATOMIC flag to make sure that TP processes the following
 	 * CPLs in an atomic manner and no wire segments can be interleaved.
 	 */
 	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
 	req = (struct cpl_set_tcb_field *)(wr + 1);
 	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
 			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
 			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
 			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
 			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
 	req++;
 	if (bufidx == 0) {
 		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
 			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
 		req++;
 		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
 			    V_TF_DDP_PUSH_DISABLE_0(1) |
 			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
 			    V_TF_DDP_PUSH_DISABLE_0(0) |
 			    V_TF_DDP_BUF0_VALID(1));
 	} else {
 		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
 			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
 			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
 		req++;
 		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
 			    V_TF_DDP_PUSH_DISABLE_1(1) |
 			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
 			    V_TF_DDP_PUSH_DISABLE_1(0) |
 			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
 	}
 
 	getreq = (struct cpl_get_tcb *)(req + 1);
 	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
 
 	/* Keep track of the number of oustanding CPL_GET_TCB requests
 	 */
 	p->get_tcb_count++;
 
 #ifdef T3_TRACE
 	T3_TRACE4(TIDTB(sk),
 		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
 		  "len %d",
 		  bufidx, tag0, tag1, len);
 #endif
 	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
 }
 
 /*
  * Sends a compound WR containing all the CPL messages needed to program the
  * two HW DDP buffers, namely optionally setting up the length and offset of
  * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
  *
  * toep:            offload PCB of the connection.
  * len0, offset0:   length/offset for buffer 0; skipped when len0 is 0.
  * len1, offset1:   length/offset for buffer 1; skipped when len1 is 0.
  * ddp_flags:       new value for the DDP flags word.
  * flag_mask:       mask selecting which DDP flag bits are written.
  * modulate:        when non-zero an RX_DATA_ACK is appended that returns
  *                  (tp_copied_seq - tp_rcv_wup) credits, and tp_rcv_wup is
  *                  advanced to tp_copied_seq.
  */
 void
 t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
 		      unsigned int len1, unsigned int offset1,
                       uint64_t ddp_flags, uint64_t flag_mask, int modulate)
 {
 	unsigned int wrlen;
 	struct mbuf *m;
 	struct work_request_hdr *wr;
 	struct cpl_set_tcb_field *req;
 
 	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
 	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
 	
 #if 0
 	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
 #endif
 	/*
 	 * The flags message is always present; the per-buffer messages and
 	 * the RX_DATA_ACK are only accounted for when they will be emitted.
 	 */
 	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
 		(len1 ? sizeof(*req) : 0) +
 		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
 	m = m_gethdr_nofail(wrlen);
 	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 	wr = mtod(m, struct work_request_hdr *);
 	bzero(wr, wrlen);
 	
 	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
 	m->m_pkthdr.len = m->m_len = wrlen;
 
 	req = (struct cpl_set_tcb_field *)(wr + 1);
 	if (len0) {                  /* program buffer 0 offset and length */
 		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
 			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
 			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
 			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
 		req++;
 	}
 	if (len1) {                  /* program buffer 1 offset and length */
 		/* Note: "<<" binds tighter than "|", so only the LEN term is shifted. */
 		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
 			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
 			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
 			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
 			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
 		req++;
 	}
 
 	/* The DDP flags update goes after any buffer programming. */
 	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
 			     ddp_flags);
 
 	if (modulate) {
 		mk_rx_data_ack_ulp(toep,
 		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
 		    toep->tp_copied_seq - toep->tp_rcv_wup);
 		toep->tp_rcv_wup = toep->tp_copied_seq;
 	}
 
 #ifdef T3_TRACE
 	T3_TRACE5(TIDTB(sk),
 		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
 		  "modulate %d",
 		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
 		  modulate);
 #endif
 
 	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
 }
 
 /*
  * Fill in the mbuf_wrs[] table and the global wrlen from the work request
  * length 'wr_len'.
  *
  * For each index n the table receives 1 when the computed length for n
  * fits into wr_len, and a larger count otherwise.  The table is only built
  * once: a non-zero mbuf_wrs[1] means it has already been initialized.
  */
 void
 t3_init_wr_tab(unsigned int wr_len)
 {
 	int n;
 
 	if (mbuf_wrs[1] != 0)     /* already initialized */
 		return;
 
 	for (n = 1; n < ARRAY_SIZE(mbuf_wrs); n++) {
 		int flits = (3 * n) / 2 + (n & 1) + 3;
 
 		if (flits <= wr_len)
 			mbuf_wrs[n] = 1;
 		else
 			mbuf_wrs[n] = 1 + (flits - 2) / (wr_len - 1);
 	}
 
 	wrlen = wr_len * 8;
 }
 
 /*
  * Register this file's CPL message handlers with the TOM layer, one
  * t3tom_register_cpl_handler() call per opcode.  Always returns 0.
  */
 int
 t3_init_cpl_io(void)
 {
 	/* Disabled leftover that still uses Linux sk_buff identifiers. */
 #ifdef notyet
 	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
 	if (!tcphdr_skb) {
 		log(LOG_ERR,
 		       "Chelsio TCP offload: can't allocate sk_buff\n");
 		return -1;
 	}
 	skb_put(tcphdr_skb, sizeof(struct tcphdr));
 	tcphdr_skb->h.raw = tcphdr_skb->data;
 	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
 #endif
 	
 	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
 	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
 	/* Work-request acks arrive as CPL_TX_DMA_ACK. */
 	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
 	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
 	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
 	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
 	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
 	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
 	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
 	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
 	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
 	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
 	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
 	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
 	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
 	return (0);
 }
 
Index: head/sys/netinet/in_pcb.c
===================================================================
--- head/sys/netinet/in_pcb.c	(revision 186221)
+++ head/sys/netinet/in_pcb.c	(revision 186222)
@@ -1,1927 +1,1927 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2008 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
 #include "opt_inet6.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/vimage.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/vinet.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/vinet6.h>
 #endif /* INET6 */
 
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/key.h>
 #endif /* IPSEC */
 
 #include <security/mac/mac_framework.h>
 
 #ifdef VIMAGE_GLOBALS
 /*
  * These configure the range of local port addresses assigned to
  * "unspecified" outgoing connections/packets/whatever.
  *
  * The code below reads them through the V_ipport_* names: the low pair is
  * used for INP_LOWPORT sockets, the hi pair for INP_HIGHPORT sockets and the
  * plain pair otherwise (see in_pcbbind_setup()).
  * NOTE(review): presumably V_ipport_x resolves to ipport_x when
  * VIMAGE_GLOBALS is defined -- confirm in the vinet headers.
  */
 int	ipport_lowfirstauto;
 int	ipport_lowlastauto;
 int	ipport_firstauto;
 int	ipport_lastauto;
 int	ipport_hifirstauto;
 int	ipport_hilastauto;
 
 /*
  * Reserved ports accessible only to root. There are significant
  * security considerations that must be accounted for when changing these,
  * but the security benefits can be great. Please be careful.
  *
  * in_pcbbind_setup() requires PRIV_NETINET_RESERVEDPORT to bind a port in
  * the inclusive range [reservedlow, reservedhigh].
  */
 int	ipport_reservedhigh;
 int	ipport_reservedlow;
 
 /*
  * Variables dealing with random ephemeral port allocation.
  * ipport_randomized and ipport_stoprandom gate the random choice in
  * in_pcbbind_setup(); ipport_tcpallocs is incremented there for every
  * automatic allocation that is not on the UDP pcbinfo.
  */
 int	ipport_randomized;
 int	ipport_randomcps;
 int	ipport_randomtime;
 int	ipport_stoprandom;
 int	ipport_tcpallocs;
 int	ipport_tcplastcount;
 #endif
 
 /*
  * Clamp the lvalue var into the inclusive range [min, max].
  *
  * Wrapped in do { } while (0) so that "RANGECHK(...);" is a single statement
  * and can be used safely as the body of an if/else without capturing the
  * caller's else branch.  Note that var may be evaluated more than once.
  */
 #define RANGECHK(var, min, max) do {					\
 	if ((var) < (min))						\
 		(var) = (min);						\
 	else if ((var) > (max))						\
 		(var) = (max);						\
 } while (0)
 
 /*
  * sysctl handler shared by the net.inet.ip.portrange.* range knobs.
  *
  * Lets sysctl_handle_int() process the request and, if that succeeded,
  * clamps all six automatic port range bounds: the "low" pair to
  * [1, IPPORT_RESERVED - 1], the others to [IPPORT_RESERVED, IPPORT_MAX].
  * Returns the error from sysctl_handle_int().
  */
 static int
 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
 {
 	INIT_VNET_INET(curvnet);
 	int rc;
 
 	rc = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
 	if (rc != 0)
 		return (rc);
 
 	/* Ranges below the reserved boundary. */
 	RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
 	RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
 
 	/* Ranges at or above the reserved boundary. */
 	RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
 
 	return (0);
 }
 
 #undef RANGECHK
 
 /* net.inet.ip.portrange: knobs for automatic local port selection. */
 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
 
 /*
  * The six range bounds share sysctl_net_ipport_check() as their handler so
  * that every write is followed by a sanity clamp of all bounds.
  */
 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
 	lowfirst, CTLTYPE_INT|CTLFLAG_RW, ipport_lowfirstauto, 0,
 	&sysctl_net_ipport_check, "I", "");
 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
 	lowlast, CTLTYPE_INT|CTLFLAG_RW, ipport_lowlastauto, 0,
 	&sysctl_net_ipport_check, "I", "");
 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
 	first, CTLTYPE_INT|CTLFLAG_RW, ipport_firstauto, 0,
 	&sysctl_net_ipport_check, "I", "");
 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
 	last, CTLTYPE_INT|CTLFLAG_RW, ipport_lastauto, 0,
 	&sysctl_net_ipport_check, "I", "");
 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
 	hifirst, CTLTYPE_INT|CTLFLAG_RW, ipport_hifirstauto, 0,
 	&sysctl_net_ipport_check, "I", "");
 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
 	hilast, CTLTYPE_INT|CTLFLAG_RW, ipport_hilastauto, 0,
 	&sysctl_net_ipport_check, "I", "");
 
 /* Bounds of the privileged port range; flagged CTLFLAG_SECURE. */
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
 	reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedhigh, 0, "");
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedlow,
 	CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedlow, 0, "");
 
 /* Random ephemeral port allocation controls. */
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomized,
 	CTLFLAG_RW, ipport_randomized, 0, "Enable random port allocation");
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomcps,
 	CTLFLAG_RW, ipport_randomcps, 0, "Maximum number of random port "
 	"allocations before switching to a sequential one");
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomtime,
 	CTLFLAG_RW, ipport_randomtime, 0,
 	"Minimum time to keep sequential port "
 	"allocation before switching to a random one");
 
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
  *
  * NOTE: It is assumed that most of these functions will be called with
  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  * functions often modify hash chains or addresses in pcbs.
  */
 
 /*
  * Allocate a PCB and associate it with the socket.
  * On success return with the PCB locked.
  *
  * The caller must hold the pcbinfo write lock (asserted below).  Returns 0
  * on success, ENOBUFS when the zone allocation fails, or the error reported
  * by mac_inpcb_init() / ipsec_init_policy() when MAC / IPSEC are compiled
  * in.  On those later failures the credential reference and the inpcb are
  * released again before returning.
  */
 int
 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 {
 #ifdef INET6
 	INIT_VNET_INET6(curvnet);
 #endif
 	struct inpcb *inp;
 	int error;
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	error = 0;
 	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
 	if (inp == NULL)
 		return (ENOBUFS);
 	/* Only the leading inp_zero_size bytes of the inpcb are cleared. */
 	bzero(inp, inp_zero_size);
 	inp->inp_pcbinfo = pcbinfo;
 	inp->inp_socket = so;
 	/* Take our own reference on the socket's credential. */
 	inp->inp_cred = crhold(so->so_cred);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
 #ifdef MAC
 	error = mac_inpcb_init(inp, M_NOWAIT);
 	if (error != 0)
 		goto out;
 	SOCK_LOCK(so);
 	mac_inpcb_create(so, inp);
 	SOCK_UNLOCK(so);
 #endif
 #ifdef IPSEC
 	error = ipsec_init_policy(so, &inp->inp_sp);
 	if (error != 0) {
 #ifdef MAC
 		/* Undo the MAC label set up above before bailing out. */
 		mac_inpcb_destroy(inp);
 #endif
 		goto out;
 	}
 #endif /*IPSEC*/
 #ifdef INET6
 	if (INP_SOCKAF(so) == AF_INET6) {
 		inp->inp_vflag |= INP_IPV6PROTO;
 		if (V_ip6_v6only)
 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
 	}
 #endif
 	/* Publish the new inpcb on the global list and on the socket. */
 	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
 	pcbinfo->ipi_count++;
 	so->so_pcb = (caddr_t)inp;
 #ifdef INET6
 	if (V_ip6_auto_flowlabel)
 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
 #endif
 	/* Success path: hand the inpcb back write-locked. */
 	INP_WLOCK(inp);
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	inp->inp_refcount = 1;	/* Reference from the inpcbinfo */
 #if defined(IPSEC) || defined(MAC)
 out:
 	/* Reached with error == 0 on success, so this is a no-op then. */
 	if (error != 0) {
 		crfree(inp->inp_cred);
 		uma_zfree(pcbinfo->ipi_zone, inp);
 	}
 #endif
 	return (error);
 }
 
 /*
  * Bind a local address and/or port to an inpcb that has neither yet.
  *
  * Requires the pcbinfo write lock and the inpcb write lock.  Returns EINVAL
  * if the inpcb already has a local port or address, the error from
  * in_pcbbind_setup() if that fails, EAGAIN if the inpcb cannot be inserted
  * into the hash (the local binding is cleared again in that case), and 0 on
  * success.  INP_ANONPORT is set when the caller did not request a specific
  * port.
  */
 int
 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	int rc, wants_anon;
 
 	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 
 	/*
 	 * Record whether a port was requested before in_pcbbind_setup()
 	 * runs, since it may rewrite the caller's sockaddr.
 	 */
 	if (nam == NULL)
 		wants_anon = (inp->inp_lport == 0);
 	else
 		wants_anon = (inp->inp_lport == 0 && sin->sin_port == 0);
 
 	rc = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
 	    &inp->inp_lport, cred);
 	if (rc != 0)
 		return (rc);
 
 	if (in_pcbinshash(inp) != 0) {
 		/* Roll the binding back so the inpcb looks unbound again. */
 		inp->inp_lport = 0;
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		return (EAGAIN);
 	}
 
 	if (wants_anon)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 
 /*
  * Set up a bind operation on a PCB, performing port allocation
  * as required, but do not actually modify the PCB. Callers can
  * either complete the bind by setting inp_laddr/inp_lport and
  * calling in_pcbinshash(), or they can just use the resulting
  * port and address to authorise the sending of a once-off packet.
  *
  * inp     - the PCB being bound (only read here).
  * nam     - requested local address/port as a sockaddr_in, or NULL to
  *           keep *laddrp and pick a port automatically.
  * laddrp  - in: current local address; out: address to use.
  * lportp  - in: current local port (network order); out: port to use.
  * cred    - credential used for privilege and jail checks.
  *
  * Returns 0 on success or an errno value (EADDRNOTAVAIL, EINVAL, EACCES,
  * EADDRINUSE, or a priv_check_cred() error).
  *
  * On error, the values of *laddrp and *lportp are not changed.
  */
 int
 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
     u_short *lportp, struct ucred *cred)
 {
 	INIT_VNET_INET(inp->inp_vnet);
 	struct socket *so = inp->inp_socket;
 	unsigned short *lastport;
 	struct sockaddr_in *sin;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct in_addr laddr;
 	u_short lport = 0;
 	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
 	int error;
 	int dorandom;
 
 	/*
 	 * Because no actual state changes occur here, a global write lock on
 	 * the pcbinfo isn't required.
 	 */
 	INP_INFO_LOCK_ASSERT(pcbinfo);
 	INP_LOCK_ASSERT(inp);
 
 	if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
 		wild = INPLOOKUP_WILDCARD;
 	if (nam) {
 		sin = (struct sockaddr_in *)nam;
 		if (nam->sa_len != sizeof (*sin))
 			return (EINVAL);
 #ifdef notdef
 		/*
 		 * We should check the family, but old programs
 		 * incorrectly fail to initialize it.
 		 */
 		if (sin->sin_family != AF_INET)
 			return (EAFNOSUPPORT);
 #endif
 		if (prison_local_ip4(cred, &sin->sin_addr))
 			return (EINVAL);
 		if (sin->sin_port != *lportp) {
 			/* Don't allow the port to change. */
 			if (*lportp != 0)
 				return (EINVAL);
 			lport = sin->sin_port;
 		}
 		/* NB: lport is left as 0 if the port isn't being changed. */
 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow complete duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if (so->so_options & SO_REUSEADDR)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
 			sin->sin_port = 0;		/* yech... */
 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 			if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
 				return (EADDRNOTAVAIL);
 		}
 		laddr = sin->sin_addr;
 		if (lport) {
 			struct inpcb *t;
 			struct tcptw *tw;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
 			    0))
 				return (EACCES);
 			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
 			    priv_check_cred(inp->inp_cred,
 			    PRIV_NETINET_REUSEPORT, 0) != 0) {
 				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 				    lport, INPLOOKUP_WILDCARD, cred);
 				/*
 				 * XXX
 				 * This entire block sorely needs a rewrite.
 				 */
 				if (t &&
 				    ((t->inp_vflag & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				     (t->inp_socket->so_options &
 					 SO_REUSEPORT) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 			}
 			if (prison_local_ip4(cred, &sin->sin_addr))
 				return (EADDRNOTAVAIL);
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, wild, cred);
 			if (t && (t->inp_vflag & INP_TIMEWAIT)) {
 				/*
 				 * XXXRW: If an inpcb has had its timewait
 				 * state recycled, we treat the address as
 				 * being in use (for now).  This is better
 				 * than a panic, but not desirable.
 				 *
 				 * The timewait state must be taken from the
 				 * conflicting pcb 't', not from the pcb being
 				 * bound: 'inp' is not in TIME_WAIT, so its
 				 * protocol block is not a struct tcptw.
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
 				    (reuseport & tw->tw_so_options) == 0)
 					return (EADDRINUSE);
 			} else if (t &&
 			    (reuseport & t->inp_socket->so_options) == 0) {
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
 				    ntohl(t->inp_laddr.s_addr) !=
 				    INADDR_ANY ||
 				    INP_SOCKAF(so) ==
 				    INP_SOCKAF(t->inp_socket))
 #endif
 				return (EADDRINUSE);
 			}
 		}
 	}
 	if (*lportp != 0)
 		lport = *lportp;
 	if (lport == 0) {
 		u_short first, last, aux;
 		int count;
 
 		if (prison_local_ip4(cred, &laddr))
 			return (EINVAL);
 
 		/* Select the port range and its "last used" cursor. */
 		if (inp->inp_flags & INP_HIGHPORT) {
 			first = V_ipport_hifirstauto;	/* sysctl */
 			last  = V_ipport_hilastauto;
 			lastport = &pcbinfo->ipi_lasthi;
 		} else if (inp->inp_flags & INP_LOWPORT) {
 			error = priv_check_cred(cred,
 			    PRIV_NETINET_RESERVEDPORT, 0);
 			if (error)
 				return (error);
 			first = V_ipport_lowfirstauto;	/* 1023 */
 			last  = V_ipport_lowlastauto;	/* 600 */
 			lastport = &pcbinfo->ipi_lastlow;
 		} else {
 			first = V_ipport_firstauto;	/* sysctl */
 			last  = V_ipport_lastauto;
 			lastport = &pcbinfo->ipi_lastport;
 		}
 		/*
 		 * For UDP, use random port allocation as long as the user
 		 * allows it.  For TCP (and as of yet unknown) connections,
 		 * use random port allocation only if the user allows it AND
 		 * ipport_tick() allows it.
 		 */
 		if (V_ipport_randomized &&
 			(!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
 			dorandom = 1;
 		else
 			dorandom = 0;
 		/*
 		 * It makes no sense to do random port allocation if
 		 * we have the only port available.
 		 */
 		if (first == last)
 			dorandom = 0;
 		/* Make sure to not include UDP packets in the count. */
 		if (pcbinfo != &V_udbinfo)
 			V_ipport_tcpallocs++;
 		/*
 		 * Instead of having two loops further down counting up or down
 		 * make sure that first is always <= last and go with only one
 		 * code path implementing all logic.
 		 */
 		if (first > last) {
 			aux = first;
 			first = last;
 			last = aux;
 		}
 
 		/* first != last is guaranteed here, so the modulus is safe. */
 		if (dorandom)
 			*lastport = first +
 				    (arc4random() % (last - first));
 
 		count = last - first;
 
 		/* Walk the range once, wrapping, until a free port is found. */
 		do {
 			if (count-- < 0)	/* completely used? */
 				return (EADDRNOTAVAIL);
 			++*lastport;
 			if (*lastport < first || *lastport > last)
 				*lastport = first;
 			lport = htons(*lastport);
 		} while (in_pcblookup_local(pcbinfo, laddr,
 		    lport, wild, cred));
 	}
 	if (prison_local_ip4(cred, &laddr))
 		return (EINVAL);
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	return (0);
 }
 
 /*
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
  * If don't have a local address for this socket yet,
  * then pick one.
  */
 int
 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	u_short lport, fport;
 	in_addr_t laddr, faddr;
 	int anonport, error;
 
 	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	lport = inp->inp_lport;
 	laddr = inp->inp_laddr.s_addr;
 	anonport = (lport == 0);
 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
 	    NULL, cred);
 	if (error)
 		return (error);
 
 	/* Do the initial binding of the local address if required. */
 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 		inp->inp_lport = lport;
 		inp->inp_laddr.s_addr = laddr;
 		if (in_pcbinshash(inp) != 0) {
 			inp->inp_laddr.s_addr = INADDR_ANY;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 
 	/* Commit the remaining changes. */
 	inp->inp_lport = lport;
 	inp->inp_laddr.s_addr = laddr;
 	inp->inp_faddr.s_addr = faddr;
 	inp->inp_fport = fport;
 	in_pcbrehash(inp);
 
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 
 /*
  * Do proper source address selection on an unbound socket in case
  * of connect. Take jails into account as well.
  *
  * inp   - the PCB; its socket options and FIB number drive the route lookup.
  * faddr - destination address.
  * laddr - out: selected source address (must not be NULL).
  * cred  - credential; when it is jailed, only jail addresses are chosen.
  *
  * Returns 0 with *laddr filled in, ENETUNREACH when no interface can be
  * found for the destination, or EADDRNOTAVAIL when prison_getip4() fails.
  * Any route looked up here is released before returning.
  */
 static int
 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
     struct ucred *cred)
 {
 	struct in_ifaddr *ia;
 	struct ifaddr *ifa;
 	struct sockaddr *sa;
 	struct sockaddr_in *sin;
 	struct route sro;
 	int error;
 
 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 
 	error = 0;
 	ia = NULL;
 	bzero(&sro, sizeof(sro));
 
 	/* Build the route lookup key from the destination address. */
 	sin = (struct sockaddr_in *)&sro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = faddr->s_addr;
 
 	/*
 	 * If route is known our src addr is taken from the i/f,
 	 * else punt.
 	 *
 	 * Find out route to destination.
 	 */
 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 		in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
 
 	/*
 	 * If we found a route, use the address corresponding to
 	 * the outgoing interface.
 	 *
 	 * Otherwise assume faddr is reachable on a directly connected
 	 * network and try to find a corresponding interface to take
 	 * the source address from.
 	 */
 	if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
 		struct ifnet *ifp;
 
 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin));
 		if (ia == NULL) {
 			error = ENETUNREACH;
 			goto done;
 		}
 
 		/* Not jailed: the interface address found is good as is. */
 		if (cred == NULL || !jailed(cred)) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed: look for a jail address on that same interface. */
 		ifp = ia->ia_ifp;
 		ia = NULL;
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr)) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		if (prison_getip4(cred, laddr) != 0)
 			error = EADDRNOTAVAIL;
 		goto done;
 	}
 
 	/*
 	 * If the outgoing interface on the route found is not
 	 * a loopback interface, use the address from that interface.
 	 * In case of jails do those three steps:
 	 * 1. check if the interface address belongs to the jail. If so use it.
 	 * 2. check if we have any address on the outgoing interface
 	 *    belonging to this jail. If so use it.
 	 * 3. as a last resort return the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
 
 		/* If not jailed, use the default returned. */
 		if (cred == NULL || !jailed(cred)) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		/* 1. Check if the iface address belongs to the jail. */
 		sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
 		if (prison_check_ip4(cred, &sin->sin_addr)) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/*
 		 * 2. Check if we have any address on the outgoing interface
 		 *    belonging to this jail.
 		 */
 		TAILQ_FOREACH(ifa, &sro.ro_rt->rt_ifp->if_addrhead, ifa_link) {
 
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr)) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		if (prison_getip4(cred, laddr) != 0)
 			error = EADDRNOTAVAIL;
 		goto done;
 	}
 
 	/*
 	 * The outgoing interface is marked with 'loopback net', so a route
 	 * to ourselves is here.
 	 * Try to find the interface of the destination address and then
 	 * take the address from there. That interface is not necessarily
 	 * a loopback interface.
 	 * In case of jails, check that it is an address of the jail
 	 * and if we cannot find, fall back to the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
 		struct sockaddr_in sain;
 
 		/* Separate lookup key; sro.ro_dst is left untouched. */
 		bzero(&sain, sizeof(struct sockaddr_in));
 		sain.sin_family = AF_INET;
 		sain.sin_len = sizeof(struct sockaddr_in);
 		sain.sin_addr.s_addr = faddr->s_addr;
 
 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain)));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet(sintosa(&sain)));
 
 		if (cred == NULL || !jailed(cred)) {
 #if __FreeBSD_version < 800000
 			/* Older kernels fall back to the route's address. */
 			if (ia == NULL)
 				ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 #endif
 			if (ia == NULL) {
 				error = ENETUNREACH;
 				goto done;
 			}
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		if (ia != NULL) {
 			struct ifnet *ifp;
 
 			/* Search that interface for an address of the jail. */
 			ifp = ia->ia_ifp;
 			ia = NULL;
 			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 
 				sa = ifa->ifa_addr;
 				if (sa->sa_family != AF_INET)
 					continue;
 				sin = (struct sockaddr_in *)sa;
 				if (prison_check_ip4(cred, &sin->sin_addr)) {
 					ia = (struct in_ifaddr *)ifa;
 					break;
 				}
 			}
 			if (ia != NULL) {
 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 				goto done;
 			}
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		if (prison_getip4(cred, laddr) != 0)
 			error = EADDRNOTAVAIL;
 		goto done;
 	}
 
 done:
 	/* Common exit: drop the route reference taken by the lookup. */
 	if (sro.ro_rt != NULL)
 		RTFREE(sro.ro_rt);
 	return (error);
 }
 
 /*
  * Set up for a connect from a socket to the specified address.
  * On entry, *laddrp and *lportp should contain the current local
  * address and port for the PCB; these are updated to the values
  * that should be placed in inp_laddr and inp_lport to complete
  * the connect.
  *
  * On success, *faddrp and *fportp will be set to the remote address
  * and port. These are not updated in the error case.
  *
  * If the operation fails because the connection already exists,
  * *oinpp will be set to the PCB of that connection so that the
  * caller can decide to override it. In all other cases, *oinpp
  * is set to NULL.
  *
  * oinpp may be NULL when the caller is not interested in the existing
  * PCB.  Returns 0 on success or an errno value (EINVAL, EAFNOSUPPORT,
  * EADDRNOTAVAIL, EADDRINUSE, or an error from in_pcbladdr() /
  * in_pcbbind_setup()).
  */
 int
 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
     struct inpcb **oinpp, struct ucred *cred)
 {
 	INIT_VNET_INET(inp->inp_vnet);
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct in_ifaddr *ia;
 	struct inpcb *oinp;
 	struct in_addr laddr, faddr, jailia;
 	u_short lport, fport;
 	int error;
 
 	/*
 	 * Because a global state change doesn't actually occur here, a read
 	 * lock is sufficient.
 	 */
 	INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
 	INP_LOCK_ASSERT(inp);
 
 	if (oinpp != NULL)
 		*oinpp = NULL;
 	/* Validate the destination sockaddr before using any of it. */
 	if (nam->sa_len != sizeof (*sin))
 		return (EINVAL);
 	if (sin->sin_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (sin->sin_port == 0)
 		return (EADDRNOTAVAIL);
 	/* Work on local copies; the out parameters are written only at the end. */
 	laddr.s_addr = *laddrp;
 	lport = *lportp;
 	faddr = sin->sin_addr;
 	fport = sin->sin_port;
 
 	if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
 		/*
 		 * If the destination address is INADDR_ANY,
 		 * use the primary local address.
 		 * If the supplied address is INADDR_BROADCAST,
 		 * and the primary interface supports broadcast,
 		 * choose the broadcast address for that interface.
 		 */
 		if (faddr.s_addr == INADDR_ANY) {
 			if (cred != NULL && jailed(cred)) {
 				/* Jailed: use the jail's own address instead. */
 				if (prison_getip4(cred, &jailia) != 0)
 					return (EADDRNOTAVAIL);
 				faddr.s_addr = jailia.s_addr;
 			} else {
 				faddr =
 				    IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->
 				    sin_addr;
 			}
 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST &&
 		    (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 		    IFF_BROADCAST))
 			faddr = satosin(&TAILQ_FIRST(
 			    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 	}
 	/* No local address yet: select a source address for faddr. */
 	if (laddr.s_addr == INADDR_ANY) {
 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
 		if (error)
 			return (error);
 
 		/*
 		 * If the destination address is multicast and an outgoing
 		 * interface has been set as a multicast option, use the
 		 * address of that interface as our source address.
 		 */
 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 		    inp->inp_moptions != NULL) {
 			struct ip_moptions *imo;
 			struct ifnet *ifp;
 
 			imo = inp->inp_moptions;
 			if (imo->imo_multicast_ifp != NULL) {
 				ifp = imo->imo_multicast_ifp;
 				/* First address configured on that interface. */
 				TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
 					if (ia->ia_ifp == ifp)
 						break;
 				if (ia == NULL)
 					return (EADDRNOTAVAIL);
 				laddr = ia->ia_addr.sin_addr;
 			}
 		}
 	}
 
 	/* Refuse a four-tuple that is already present in the hash. */
 	oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
 	    0, NULL);
 	if (oinp != NULL) {
 		if (oinpp != NULL)
 			*oinpp = oinp;
 		return (EADDRINUSE);
 	}
 	/* No local port yet: let the bind code pick one automatically. */
 	if (lport == 0) {
 		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
 		    cred);
 		if (error)
 			return (error);
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	*faddrp = faddr.s_addr;
 	*fportp = fport;
 	return (0);
 }
 
 /*
  * Clear the foreign address and port of an inpcb and rehash it.
  * Requires the pcbinfo write lock and the inpcb write lock.
  */
 void
 in_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/* Forget the peer, then move to the hash chain for the new key. */
 	inp->inp_fport = 0;
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	in_pcbrehash(inp);
 }
 
 /*
  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
  * For most protocols, this will be invoked immediately prior to calling
  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
  * socket, in which case in_pcbfree() is deferred.
  */
 void
 in_pcbdetach(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
 
 /*
  * in_pcbfree_internal() frees an inpcb that has been detached from its
  * socket, and whose reference count has reached 0.  It will also remove the
  * inpcb from any global lists it might remain on.
  *
  * Must be called with the pcbinfo write lock and the inpcb write lock held;
  * the inpcb lock is released here just before the memory is returned to
  * the zone.
  */
 static void
 in_pcbfree_internal(struct inpcb *inp)
 {
 	struct inpcbinfo *ipi = inp->inp_pcbinfo;
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 	KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__));
 
 	INP_INFO_WLOCK_ASSERT(ipi);
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef IPSEC
 	if (inp->inp_sp != NULL)
 		ipsec_delete_pcbpolicy(inp);
 #endif /* IPSEC */
 	/* Bump the generation count, then unlink from the global lists. */
 	inp->inp_gencnt = ++ipi->ipi_gencnt;
 	in_pcbremlists(inp);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6PROTO) {
 		ip6_freepcbopts(inp->in6p_outputopts);
 		ip6_freemoptions(inp->in6p_moptions);
 	}
 #endif
 	/* Release IPv4 options and multicast options, if any. */
 	if (inp->inp_options)
 		(void)m_free(inp->inp_options);
 	if (inp->inp_moptions != NULL)
 		inp_freemoptions(inp->inp_moptions);
 	inp->inp_vflag = 0;
 	/* Drop the credential reference held by the inpcb. */
 	crfree(inp->inp_cred);
 
 #ifdef MAC
 	mac_inpcb_destroy(inp);
 #endif
 	/* Unlock before handing the memory back to the zone. */
 	INP_WUNLOCK(inp);
 	uma_zfree(ipi->ipi_zone, inp);
 }
 
 /*
  * in_pcbref() takes an extra reference on an inpcb so that the pointer
  * stays valid while the inpcb lock is dropped.  TCP uses this when the
  * inpcbinfo lock has to be acquired or upgraded while the inpcb lock is
  * already held.
  *
  * The reference only keeps the inpcb from being freed; the connection's
  * state may still change while the lock is released, so callers must
  * revalidate anything they cached once they hold the lock again.  The
  * reference is dropped with in_pcbrele().
  */
 void
 in_pcbref(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* An inpcb with no references must not be resurrected. */
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	inp->inp_refcount += 1;
 }
 
 /*
  * Release a reference obtained with in_pcbref().  in_pcbfree() may have
  * run between in_pcbref() and this call, so the return value tells the
  * caller what happened: 0 means references remain and the inpcb is still
  * valid and still locked; 1 means this was the last reference and the
  * inpcb has been freed.
  */
 int
 in_pcbrele(struct inpcb *inp)
 {
 #ifdef INVARIANTS
 	struct inpcbinfo *ipi = inp->inp_pcbinfo;
 #endif
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_INFO_WLOCK_ASSERT(ipi);
 	INP_WLOCK_ASSERT(inp);
 
 	if (--inp->inp_refcount > 0)
 		return (0);
 
 	/* Last reference gone: tear the inpcb down. */
 	in_pcbfree_internal(inp);
 	return (1);
 }
 
 /*
  * Unconditionally schedule an inpcb for destruction by dropping one
  * reference; this is only legal once the inpcb has been detached from its
  * socket.  When another thread still holds a temporary reference (taken
  * with in_pcbref()), the actual free is postponed until that thread calls
  * in_pcbrele(), but the inpcb is unlocked here either way.
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
 #ifdef INVARIANTS
 	struct inpcbinfo *ipi = inp->inp_pcbinfo;
 #endif
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL",
 	    __func__));
 
 	INP_INFO_WLOCK_ASSERT(ipi);
 	INP_WLOCK_ASSERT(inp);
 
 	/* in_pcbrele() returns 0 when the inpcb survived and is still locked. */
 	if (in_pcbrele(inp) == 0)
 		INP_WUNLOCK(inp);
 }
 
 /*
  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
  * port reservation, and preventing it from being returned by inpcb lookups.
  *
  * TCP uses it to mark an inpcb as unused, so that no further packets or
  * event notifications reach it, when the socket is still open but TCP has
  * already closed (for example after a shutdown()-initiated close or a RST
  * on the wire).  The port binding becomes reusable while so_pcb keeps
  * pointing at a valid inpcb until in_pcbdetach().
  *
  * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash
  * lists, but can lead to confusing netstat output, as open sockets with
  * closed TCP connections will no longer appear to have their bound port
  * number.  An explicit flag would be better, as it would allow us to leave
  * the port number intact after the connection is dropped.
  *
  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
  * in_pcbnotifyall() and in_pcbpurgeif0()?
  */
 void
 in_pcbdrop(struct inpcb *inp)
 {
 	struct inpcbport *port_hd;
 
 	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	inp->inp_vflag |= INP_DROPPED;
 
 	/* A zero local port means the inpcb is not on any hash list. */
 	if (inp->inp_lport == 0)
 		return;
 
 	port_hd = inp->inp_phd;
 	LIST_REMOVE(inp, inp_hash);
 	LIST_REMOVE(inp, inp_portlist);
 
 	/* Free the per-port head once its last inpcb is gone. */
 	if (LIST_FIRST(&port_hd->phd_pcblist) == NULL) {
 		LIST_REMOVE(port_hd, phd_hash);
 		free(port_hd, M_PCB);
 	}
 	inp->inp_lport = 0;
 }
 
 /*
  * Common routines to return the socket addresses associated with inpcbs.
  *
  * in_sockaddr() allocates a zeroed sockaddr_in from M_SONAME (sleeping if
  * necessary), fills in the given port and address, and returns it as a
  * generic sockaddr pointer.
  */
 struct sockaddr *
 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sa4;
 
 	sa4 = malloc(sizeof(*sa4), M_SONAME, M_WAITOK | M_ZERO);
 	sa4->sin_len = sizeof(*sa4);
 	sa4->sin_family = AF_INET;
 	sa4->sin_port = port;
 	sa4->sin_addr = *addr_p;
 
 	return ((struct sockaddr *)sa4);
 }
 
 int
 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->inp_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 /*
  * Call notify(inp, errno) on every inpcb in pcbinfo whose foreign address
  * equals faddr and which still has a socket (and, with INET6, which is
  * flagged INP_IPV4).
  *
  * The pcbinfo write lock is held across the walk and each inpcb is
  * write-locked while it is examined.  The inpcb is unlocked here unless the
  * notify callback returns NULL.
  */
 void
 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *inp, *next_inp;
 	int wanted;
 
 	INP_INFO_WLOCK(pcbinfo);
 	LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, next_inp) {
 		INP_WLOCK(inp);
 
 		wanted = (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_socket != NULL);
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			wanted = 0;
 #endif
 		/* The callback only runs for matching inpcbs. */
 		if (!wanted || (*notify)(inp, errno))
 			INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Remove references to ifp from the IPv4 multicast options of every pcb
  * on pcbinfo: clear it as the selected outgoing interface and drop group
  * memberships that were joined through it.
  */
 void
 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 	struct ip_moptions *imo;
 	int idx, kept;
 
 	INP_INFO_RLOCK(pcbinfo);
 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		imo = inp->inp_moptions;
 		if (imo == NULL || (inp->inp_vflag & INP_IPV4) == 0) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 
 		/* Unselect the outgoing interface if it is being detached. */
 		if (imo->imo_multicast_ifp == ifp)
 			imo->imo_multicast_ifp = NULL;
 
 		/*
 		 * Leave every group joined through the detaching interface
 		 * and slide the surviving entries down over the holes;
 		 * 'kept' is the next slot to fill.
 		 */
 		kept = 0;
 		for (idx = 0; idx < imo->imo_num_memberships; idx++) {
 			if (imo->imo_membership[idx]->inm_ifp == ifp) {
 				in_delmulti(imo->imo_membership[idx]);
 				continue;
 			}
 			if (kept != idx)
 				imo->imo_membership[kept] =
 				    imo->imo_membership[idx];
 			kept++;
 		}
 		imo->imo_num_memberships = kept;
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(pcbinfo);
 }
 
 /*
  * Lookup a PCB based on the local address and port.
  *
  * The pcbinfo lock must be held (asserted below).  When cred is non-NULL,
  * only PCBs whose credential has the same cr_prison as cred are considered.
  *
  * If wild_okay is zero, only the hash chain computed for a wildcard
  * foreign address is searched, and the first PCB with an INADDR_ANY
  * foreign address and exactly matching laddr/lport is returned.
  *
  * If wild_okay is non-zero, every PCB bound to lport is scored and the one
  * with the lowest "wildcard" score is returned; a score of 0 ends the
  * search early.  Returns NULL when nothing matches.
  */
 #define INP_LOOKUP_MAPPED_PCB_COST	3
 struct inpcb *
 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
     u_short lport, int wild_okay, struct ucred *cred)
 {
 	struct inpcb *inp;
 	/* Start above the largest score a candidate can accumulate. */
 #ifdef INET6
 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 #else
 	int matchwild = 3;
 #endif
 	int wildcard;
 
 	INP_INFO_LOCK_ASSERT(pcbinfo);
 
 	if (!wild_okay) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
 			    inp->inp_laddr.s_addr == laddr.s_addr &&
 			    inp->inp_lport == lport) {
 				/*
 				 * Found?
 				 */
 				if (cred == NULL ||
 				    inp->inp_cred->cr_prison == cred->cr_prison)
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				wildcard = 0;
 				if (cred != NULL &&
 				    inp->inp_cred->cr_prison != cred->cr_prison)
 					continue;
 #ifdef INET6
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV4) == 0)
 					continue;
 				/*
 				 * We never select the PCB that has
 				 * INP_IPV6 flag and is bound to :: if
 				 * we have another PCB which is bound
 				 * to 0.0.0.0.  If a PCB has the
 				 * INP_IPV6 flag, then we set its cost
 				 * higher than IPv4 only PCBs.
 				 *
 				 * Note that the case only happens
 				 * when a socket is bound to ::, under
 				 * the condition that the use of the
 				 * mapped address is allowed.
 				 */
 				if ((inp->inp_vflag & INP_IPV6) != 0)
 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 #endif
 				/* A connected PCB costs one point. */
 				if (inp->inp_faddr.s_addr != INADDR_ANY)
 					wildcard++;
 				/*
 				 * One point when exactly one of the two
 				 * local addresses is INADDR_ANY; two
 				 * different specific addresses disqualify
 				 * the PCB outright.
 				 */
 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
 					if (laddr.s_addr == INADDR_ANY)
 						wildcard++;
 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
 						continue;
 				} else {
 					if (laddr.s_addr != INADDR_ANY)
 						wildcard++;
 				}
 				/* Keep the lowest score; ties keep the earlier PCB. */
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
 /*
  * Lookup PCB in hash list.
  *
  * The pcbinfo lock must be held (asserted below).  fport_arg and lport_arg
  * are truncated to u_short before use.  An exact four-tuple match is tried
  * first; among exact matches a PCB with a jailed credential is returned
  * immediately, otherwise the first exact match found is returned.
  *
  * If no exact match exists and wildcard == INPLOOKUP_WILDCARD, the chain
  * for a wildcard foreign address is searched for PCBs bound to lport, in
  * the preference order documented in the body.  ifp is only consulted for
  * the IFT_FAITH check.  Returns NULL when nothing matches.
  */
 struct inpcb *
 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
     struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	INP_INFO_LOCK_ASSERT(pcbinfo);
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
 	LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (jailed(inp->inp_cred))
 				return (inp);
 			/* Remember only the first non-jailed exact match. */
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if (wildcard == INPLOOKUP_WILDCARD) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			/* XXX inp locking */
 			if (ifp && ifp->if_type == IFT_FAITH &&
 			    (inp->inp_flags & INP_FAITH) == 0)
 				continue;
 
 			/*
 			 * Jailed PCBs must pass prison_check_ip4() for
 			 * laddr.  Non-jailed PCBs are skipped once a
 			 * non-jailed exact local match has been recorded.
 			 */
 			injail = jailed(inp->inp_cred);
 			if (injail) {
 				if (!prison_check_ip4(inp->inp_cred, &laddr))
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif /* INET6 */
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		/* Return the best remaining candidate, most preferred first. */
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 #ifdef INET6
 		if (local_wild_mapped != NULL)
 			return (local_wild_mapped);
 #endif /* defined(INET6) */
 	} /* if (wildcard == INPLOOKUP_WILDCARD) */
 
 	return (NULL);
 }
 
 /*
  * Insert PCB onto various hash lists.
  */
 int
 in_pcbinshash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbporthead *port_bucket;
 	struct inpcbhead *conn_bucket;
 	struct inpcbport *phd;
 	u_int32_t faddr_key;
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/* IPv6 pcbs hash on the last 32 bits of the foreign address. */
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		faddr_key = inp->in6p_faddr.s6_addr32[3] /* XXX */;
 	else
 #endif /* INET6 */
 	faddr_key = inp->inp_faddr.s_addr;
 
 	conn_bucket = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr_key,
 	    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 	port_bucket = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
 	/*
 	 * Find the port head for this local port, creating and linking a
 	 * new one when the port is not in use yet.  Nothing has been
 	 * modified if the allocation fails.
 	 */
 	LIST_FOREACH(phd, port_bucket, phd_hash) {
 		if (phd->phd_port == inp->inp_lport)
 			break;
 	}
 	if (phd == NULL) {
 		phd = malloc(sizeof(*phd), M_PCB, M_NOWAIT);
 		if (phd == NULL)
 			return (ENOBUFS); /* XXX */
 		phd->phd_port = inp->inp_lport;
 		LIST_INIT(&phd->phd_pcblist);
 		LIST_INSERT_HEAD(port_bucket, phd, phd_hash);
 	}
 
 	/* Link the pcb onto the port list and the connection hash chain. */
 	inp->inp_phd = phd;
 	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	LIST_INSERT_HEAD(conn_bucket, inp, inp_hash);
 	return (0);
 }
 
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have  been
  * changed. NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
  * not change after in_pcbinshash() has been called.
  */
 void
 in_pcbrehash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbhead *new_bucket;
 	u_int32_t faddr_key;
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/* IPv6 pcbs hash on the last 32 bits of the foreign address. */
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		faddr_key = inp->in6p_faddr.s6_addr32[3] /* XXX */;
 	else
 #endif /* INET6 */
 	faddr_key = inp->inp_faddr.s_addr;
 
 	new_bucket = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr_key,
 	    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	/* Move the pcb from its current chain to the head of the new one. */
 	LIST_REMOVE(inp, inp_hash);
 	LIST_INSERT_HEAD(new_bucket, inp, inp_hash);
 }
 
 /*
  * Remove PCB from various lists.
  */
 void
 in_pcbremlists(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbport *porthead;
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/* Advance the global generation count and stamp the pcb with it. */
 	pcbinfo->ipi_gencnt++;
 	inp->inp_gencnt = pcbinfo->ipi_gencnt;
 
 	/* Only a pcb with a local port is on the hash and port lists. */
 	if (inp->inp_lport != 0) {
 		porthead = inp->inp_phd;
 		LIST_REMOVE(inp, inp_hash);
 		LIST_REMOVE(inp, inp_portlist);
 		if (LIST_EMPTY(&porthead->phd_pcblist)) {
 			LIST_REMOVE(porthead, phd_hash);
 			free(porthead, M_PCB);
 		}
 	}
 
 	LIST_REMOVE(inp, inp_list);
 	pcbinfo->ipi_count--;
 }
 
 /*
  * A set label operation has occurred at the socket layer, propagate the
  * label change into the in_pcb for the socket.
  */
 void
 in_pcbsosetlabel(struct socket *so)
 {
 #ifdef MAC
 	struct inpcb *pcb = sotoinpcb(so);
 
 	KASSERT(pcb != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 
 	/* The socket lock is taken inside the pcb write lock. */
 	INP_WLOCK(pcb);
 	SOCK_LOCK(so);
 	mac_inpcb_sosetlabel(so, pcb);
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(pcb);
 #endif
 }
 
 /*
  * ipport_tick runs once per second, determining if random port allocation
  * should be continued.  If more than ipport_randomcps ports have been
  * allocated in the last second, then we return to sequential port
  * allocation. We return to random allocation only once we drop below
  * ipport_randomcps for at least ipport_randomtime seconds.
  */
 void
 ipport_tick(void *xtp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
 		INIT_VNET_INET(vnet_iter);
 		if (V_ipport_tcpallocs >
 		    V_ipport_tcplastcount + V_ipport_randomcps) {
 			/* Too many allocations since the last tick. */
 			V_ipport_stoprandom = V_ipport_randomtime;
 		} else if (V_ipport_stoprandom > 0) {
 			/* Quiet tick: count the hold-off down towards zero. */
 			V_ipport_stoprandom--;
 		}
 		V_ipport_tcplastcount = V_ipport_tcpallocs;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 
 	/* Re-arm the callout to run again after hz ticks. */
 	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
 }
 
 /* Function form of INP_WLOCK(): acquire the write lock on inp. */
 void
 inp_wlock(struct inpcb *inp)
 {
 
 	INP_WLOCK(inp);
 }
 
 /* Function form of INP_WUNLOCK(): release the write lock on inp. */
 void
 inp_wunlock(struct inpcb *inp)
 {
 
 	INP_WUNLOCK(inp);
 }
 
 /* Function form of INP_RLOCK(): acquire the read lock on inp. */
 void
 inp_rlock(struct inpcb *inp)
 {
 
 	INP_RLOCK(inp);
 }
 
 /* Function form of INP_RUNLOCK(): release the read lock on inp. */
 void
 inp_runlock(struct inpcb *inp)
 {
 
 	INP_RUNLOCK(inp);
 }
 
 #ifdef INVARIANTS
 /*
  * Assert that inp is write-locked.  Note that despite the generic name
  * this checks for the write lock specifically (INP_WLOCK_ASSERT).
  */
 void
 inp_lock_assert(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 }
 
 /* Assert that inp is not locked (INP_UNLOCK_ASSERT). */
 void
 inp_unlock_assert(struct inpcb *inp)
 {
 
 	INP_UNLOCK_ASSERT(inp);
 }
 #endif
 
 /*
  * Call func(inp, arg) once for every pcb on the TCP pcb list of the
  * current vnet.  The pcbinfo read lock is held for the whole walk and
  * each pcb is write-locked around its callback.
  */
 void
 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
 {
 	INIT_VNET_INET(curvnet);
 	struct inpcb *pcb;
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	LIST_FOREACH(pcb, V_tcbinfo.ipi_listhead, inp_list) {
 		INP_WLOCK(pcb);
 		(*func)(pcb, arg);
 		INP_WUNLOCK(pcb);
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 }
 
 /* Return the socket back pointer of inp; inp must be write-locked. */
 struct socket *
 inp_inpcbtosocket(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return (inp->inp_socket);
 }
 
 /*
  * Return inp_ppcb cast to a tcpcb pointer; inp must be write-locked.  The
  * cast is unconditional, so the caller must know that inp_ppcb really is a
  * tcpcb.
  */
 struct tcpcb *
 inp_inpcbtotcpcb(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return ((struct tcpcb *)inp->inp_ppcb);
 }
 
 /* Return the IPv4 type-of-service value of inp; no lock is asserted. */
 int
 inp_ip_tos_get(const struct inpcb *inp)
 {
 
 	return (inp->inp_ip_tos);
 }
 
 /*
  * Set the IPv4 type-of-service value of inp; no lock is asserted.  The
  * field is a u_char, so val is narrowed on assignment.
  */
 void
 inp_ip_tos_set(struct inpcb *inp, int val)
 {
 
 	inp->inp_ip_tos = val;
 }
 
 /*
  * Copy the local/foreign IPv4 addresses and ports of inp into the four
  * output parameters, exactly as stored in the pcb (no byte swapping is
  * done here).  inp must be locked, read or write.
  */
 void
 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
     uint32_t *faddr, uint16_t *fp)
 {
 
 	INP_LOCK_ASSERT(inp);
 	*laddr = inp->inp_laddr.s_addr;
 	*faddr = inp->inp_faddr.s_addr;
 	*lp = inp->inp_lport;
 	*fp = inp->inp_fport;
 }
 
 /* Function form of the sotoinpcb() conversion. */
 struct inpcb *
 so_sotoinpcb(struct socket *so)
 {
 
 	return (sotoinpcb(so));
 }
 
 /* Function form of the sototcpcb() conversion. */
 struct tcpcb *
 so_sototcpcb(struct socket *so)
 {
 
 	return (sototcpcb(so));
 }
 
 #ifdef DDB
 /* Emit 'indent' spaces on the debugger console; nothing if indent <= 0. */
 static void
 db_print_indent(int indent)
 {
 	int remaining;
 
 	remaining = indent;
 	while (remaining > 0) {
 		db_printf(" ");
 		remaining--;
 	}
 }
 
 /*
  * Print an in_conninfo for the debugger: a "<name> at <ptr>" header at
  * 'indent', followed by the local and foreign address/port pairs indented
  * two further columns.  Ports are converted with ntohs() for display.
  */
 static void
 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 {
 	/* Scratch buffers for the textual form of either address family. */
 	char faddr_str[48], laddr_str[48];
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inc);
 
 	indent += 2;
 
 #ifdef INET6
-	if (inc->inc_flags == 1) {
+	if (inc->inc_flags & INC_ISIPV6) {
 		/* IPv6. */
 		ip6_sprintf(laddr_str, &inc->inc6_laddr);
 		ip6_sprintf(faddr_str, &inc->inc6_faddr);
 	} else {
 #endif
 		/* IPv4. */
 		inet_ntoa_r(inc->inc_laddr, laddr_str);
 		inet_ntoa_r(inc->inc_faddr, faddr_str);
 #ifdef INET6
 	}
 #endif
 	db_print_indent(indent);
 	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
 	    ntohs(inc->inc_lport));
 	db_print_indent(indent);
 	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
 	    ntohs(inc->inc_fport));
 }
 
 /*
  * Print the symbolic names of the bits set in inp_flags as a
  * comma-separated list, with no trailing newline.
  *
  * The names are emitted in table order.  The separator is always printed
  * before a name (and never before the first one), which also fixes the
  * IN6P_MTU entry: it used to print its separator after the name.
  */
 static void
 db_print_inpflags(int inp_flags)
 {
 	static const struct {
 		unsigned int	 flag;
 		const char	*name;
 	} flag_names[] = {
 		{ INP_RECVOPTS,		"INP_RECVOPTS" },
 		{ INP_RECVRETOPTS,	"INP_RECVRETOPTS" },
 		{ INP_RECVDSTADDR,	"INP_RECVDSTADDR" },
 		{ INP_HDRINCL,		"INP_HDRINCL" },
 		{ INP_HIGHPORT,		"INP_HIGHPORT" },
 		{ INP_LOWPORT,		"INP_LOWPORT" },
 		{ INP_ANONPORT,		"INP_ANONPORT" },
 		{ INP_RECVIF,		"INP_RECVIF" },
 		{ INP_MTUDISC,		"INP_MTUDISC" },
 		{ INP_FAITH,		"INP_FAITH" },
 		{ INP_RECVTTL,		"INP_RECVTTL" },
 		{ INP_DONTFRAG,		"INP_DONTFRAG" },
 		{ IN6P_IPV6_V6ONLY,	"IN6P_IPV6_V6ONLY" },
 		{ IN6P_PKTINFO,		"IN6P_PKTINFO" },
 		{ IN6P_HOPLIMIT,	"IN6P_HOPLIMIT" },
 		{ IN6P_HOPOPTS,		"IN6P_HOPOPTS" },
 		{ IN6P_DSTOPTS,		"IN6P_DSTOPTS" },
 		{ IN6P_RTHDR,		"IN6P_RTHDR" },
 		{ IN6P_RTHDRDSTOPTS,	"IN6P_RTHDRDSTOPTS" },
 		{ IN6P_TCLASS,		"IN6P_TCLASS" },
 		{ IN6P_AUTOFLOWLABEL,	"IN6P_AUTOFLOWLABEL" },
 		{ IN6P_RFC2292,		"IN6P_RFC2292" },
 		{ IN6P_MTU,		"IN6P_MTU" },
 	};
 	const char *sep;
 	unsigned int i;
 
 	/* Empty before the first name, ", " before every later one. */
 	sep = "";
 	for (i = 0; i < sizeof(flag_names) / sizeof(flag_names[0]); i++) {
 		if (((unsigned int)inp_flags & flag_names[i].flag) == 0)
 			continue;
 		db_printf("%s%s", sep, flag_names[i].name);
 		sep = ", ";
 	}
 }
 
 /*
  * Print the symbolic names of the bits set in inp_vflag as a
  * comma-separated list, with no trailing newline.
  */
 static void
 db_print_inpvflag(u_char inp_vflag)
 {
 	static const struct {
 		u_char		 bit;
 		const char	*name;
 	} vflag_names[] = {
 		{ INP_IPV4,		"INP_IPV4" },
 		{ INP_IPV6,		"INP_IPV6" },
 		{ INP_IPV6PROTO,	"INP_IPV6PROTO" },
 		{ INP_TIMEWAIT,		"INP_TIMEWAIT" },
 		{ INP_ONESBCAST,	"INP_ONESBCAST" },
 		{ INP_DROPPED,		"INP_DROPPED" },
 		{ INP_SOCKREF,		"INP_SOCKREF" },
 	};
 	const char *sep;
 	unsigned int i;
 
 	/* Empty before the first name, ", " before every later one. */
 	sep = "";
 	for (i = 0; i < sizeof(vflag_names) / sizeof(vflag_names[0]); i++) {
 		if ((inp_vflag & vflag_names[i].bit) == 0)
 			continue;
 		db_printf("%s%s", sep, vflag_names[i].name);
 		sep = ", ";
 	}
 }
 
 /*
  * Print the fields of an inpcb for the debugger.  A "<name> at <ptr>"
  * header is printed at 'indent'; every following line is indented two
  * further columns.  IPv6 pcbs (INP_IPV6 set in inp_vflag) print their
  * in6p_* option fields, all others print the IPv4 option fields.
  */
 void
 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 
 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 
 	db_print_indent(indent);
 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 
 	db_print_indent(indent);
 	db_printf("inp_label: %p   inp_flags: 0x%x (",
 	   inp->inp_label, inp->inp_flags);
 	db_print_inpflags(inp->inp_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 	    inp->inp_vflag);
 	db_print_inpvflag(inp->inp_vflag);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 
 	db_print_indent(indent);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
 		    "in6p_moptions: %p\n", inp->in6p_options,
 		    inp->in6p_outputopts, inp->in6p_moptions);
 		/*
 		 * The IPv6 branch prints two lines; indent the second one
 		 * as well so that it lines up with the rest of the output.
 		 */
 		db_print_indent(indent);
 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 		    inp->in6p_hops);
 	} else
 #endif
 	{
 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 		    inp->inp_options, inp->inp_moptions);
 	}
 
 	db_print_indent(indent);
 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 	    (uintmax_t)inp->inp_gencnt);
 }
 
 /*
  * Debugger command "show inpcb <addr>": treat addr as a pointer to a
  * struct inpcb and print it.  The address is used as given; no validation
  * is performed here.
  */
 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 {
 	struct inpcb *inp;
 
 	if (!have_addr) {
 		db_printf("usage: show inpcb <addr>\n");
 		return;
 	}
 	inp = (struct inpcb *)addr;
 
 	db_print_inpcb(inp, "inpcb", 0);
 }
 #endif
Index: head/sys/netinet/in_pcb.h
===================================================================
--- head/sys/netinet/in_pcb.h	(revision 186221)
+++ head/sys/netinet/in_pcb.h	(revision 186222)
@@ -1,511 +1,517 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET_IN_PCB_H_
 #define _NETINET_IN_PCB_H_
 
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/_rwlock.h>
 
 #include <net/route.h>
 
 #ifdef _KERNEL
 #include <sys/rwlock.h>
 #endif
 
 #define	in6pcb		inpcb	/* for KAME src sync over BSD*'s */
 #define	in6p_sp		inp_sp	/* for KAME src sync over BSD*'s */
 struct inpcbpolicy;
 
 /*
  * struct inpcb is the common protocol control block structure used in most
  * IP transport protocols.
  *
  * Pointers to local and foreign host table entries, local and foreign socket
  * numbers, and pointers up (to a socket structure) and down (to a
  * protocol-specific control block) are stored here.
  */
 LIST_HEAD(inpcbhead, inpcb);
 LIST_HEAD(inpcbporthead, inpcbport);
 typedef	u_quad_t	inp_gen_t;
 
 /*
  * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
  * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
  * the following structure.
  */
 struct in_addr_4in6 {
 	u_int32_t	ia46_pad32[3];	/* three 32-bit words ahead of the IPv4 address */
 	struct	in_addr	ia46_addr4;	/* the IPv4 address itself */
 };
 
 /*
  * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.  in_conninfo has
  * some extra padding to accomplish this.
  */
 struct in_endpoints {
 	u_int16_t	ie_fport;		/* foreign port */
 	u_int16_t	ie_lport;		/* local port */
 	/* protocol dependent part, local and foreign addr */
 	/*
 	 * Each union overlays the IPv4-in-IPv6 layout (in_addr_4in6) on a
 	 * full in6_addr, so one pair of fields serves both families.
 	 */
 	union {
 		/* foreign host table entry */
 		struct	in_addr_4in6 ie46_foreign;
 		struct	in6_addr ie6_foreign;
 	} ie_dependfaddr;
 	union {
 		/* local host table entry */
 		struct	in_addr_4in6 ie46_local;
 		struct	in6_addr ie6_local;
 	} ie_dependladdr;
 };
 /* Shorthand accessors for the IPv4 and IPv6 views of the two unions. */
 #define	ie_faddr	ie_dependfaddr.ie46_foreign.ia46_addr4
 #define	ie_laddr	ie_dependladdr.ie46_local.ia46_addr4
 #define	ie6_faddr	ie_dependfaddr.ie6_foreign
 #define	ie6_laddr	ie_dependladdr.ie6_local
 
 /*
  * XXX The defines for inc_* are hacks and should be changed to direct
  * references.
  */
 struct in_conninfo {
 	u_int8_t	inc_flags;	/* flag bits, see INC_ISIPV6 */
 	u_int8_t	inc_len;
 	u_int16_t	inc_fibnum;	/* XXX was pad, 16 bits is plenty */
 	/* protocol dependent part */
 	struct	in_endpoints inc_ie;	/* ports and addresses */
 };
+
+/*
+ * Flags for inc_flags.
+ */
+#define	INC_ISIPV6	0x01
+
 #define inc_isipv6	inc_flags	/* temp compatibility */
 /* Shorthand accessors for the endpoint fields inside inc_ie. */
 #define	inc_fport	inc_ie.ie_fport
 #define	inc_lport	inc_ie.ie_lport
 #define	inc_faddr	inc_ie.ie_faddr
 #define	inc_laddr	inc_ie.ie_laddr
 #define	inc6_faddr	inc_ie.ie6_faddr
 #define	inc6_laddr	inc_ie.ie6_laddr
 
 struct	icmp6_filter;
 
 /*-
  * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4
  * and IPv6 sockets.  In the case of TCP, further per-connection state is
  * hung off of inp_ppcb most of the time.  Almost all fields of struct inpcb
  * are static after creation or protected by a per-inpcb rwlock, inp_lock.  A
  * few fields also require the global pcbinfo lock for the inpcb to be held,
  * when modified, such as the global connection lists and hashes, as well as
  * binding information (which affects which hash a connection is on).  This
  * model means that connections can be looked up without holding the
  * per-connection lock, which is important for performance when attempting to
  * find the connection for a packet given its IP and port tuple.  Writing to
  * these fields requires that write locks be held on both the inpcb and global
  * locks.
  *
  * Key:
  * (c) - Constant after initialization
  * (i) - Protected by the inpcb lock
  * (p) - Protected by the pcbinfo lock for the inpcb
  * (s) - Protected by another subsystem's locks
  * (x) - Undefined locking
  *
  * A few other notes:
  *
  * When a read lock is held, stability of the field is guaranteed; to write
  * to a field, a write lock must generally be held.
  *
  * netinet/netinet6-layer code should not assume that the inp_socket pointer
  * is safe to dereference without inp_lock being held, even for protocols
  * other than TCP (where the inpcb persists during TIMEWAIT even after the
  * socket has been freed), or there may be close(2)-related races.
  *
  * The inp_vflag field is overloaded, and would otherwise ideally be (c).
  */
 struct inpcb {
 	LIST_ENTRY(inpcb) inp_hash;	/* (i/p) hash list */
 	LIST_ENTRY(inpcb) inp_list;	/* (i/p) list for all PCBs for proto */
 	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
 	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
 	struct	socket *inp_socket;	/* (i) back pointer to socket */
 	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
 	u_int32_t inp_flow;		/* (i) IPv6 flow information */
 	int	inp_flags;		/* (i) generic IP/datagram flags */
 	u_char	inp_vflag;		/* (i) IP version flag (v4/v6) */
 	u_char	inp_ip_ttl;		/* (i) time to live proto */
 	u_char	inp_ip_p;		/* (c) protocol proto */
 	u_char	inp_ip_minttl;		/* (i) minimum TTL or drop */
 	uint32_t inp_ispare1;		/* (x) connection id / queue id */
 	u_int	inp_refcount;		/* (i) refcount */
 	void	*inp_pspare[2];		/* (x) rtentry / general use */
 
 	/* Local and foreign ports, local and foreign addr. */
 	struct	in_conninfo inp_inc;	/* (i/p) connection endpoints */
 
 	/* MAC and IPSEC policy information. */
 	struct	label *inp_label;	/* (i) MAC label */
 	struct	inpcbpolicy *inp_sp;    /* (s) for IPSEC */
 
 	/* Protocol-dependent part; options. */
 	struct {
 		u_char	inp4_ip_tos;		/* (i) type of service proto */
 		struct	mbuf *inp4_options;	/* (i) IP options */
 		struct	ip_moptions *inp4_moptions; /* (i) IP mcast options */
 	} inp_depend4;
 	struct {
 		/* (i) IP options */
 		struct	mbuf *inp6_options;
 		/* (i) IP6 options for outgoing packets */
 		struct	ip6_pktopts *inp6_outputopts;
 		/* (i) IP multicast options */
 		struct	ip6_moptions *inp6_moptions;
 		/* (i) ICMPv6 code type filter */
 		struct	icmp6_filter *inp6_icmp6filt;
 		/* (i) IPV6_CHECKSUM setsockopt */
 		int	inp6_cksum;
 		/* (i) default hop limit */
 		short	inp6_hops;
 	} inp_depend6;
 	LIST_ENTRY(inpcb) inp_portlist;	/* (i/p) list for PCB's local port */
 	struct	inpcbport *inp_phd;	/* (i/p) head of this list */
 /* Offset of the first field after the region covered by inp_zero_size. */
 #define inp_zero_size offsetof(struct inpcb, inp_gencnt)
 	inp_gen_t	inp_gencnt;	/* (c) generation count */
 	struct rwlock	inp_lock;	/* per-inpcb lock, see INP_*LOCK() */
 };
 /* Shorthand accessors for the IPv4 view of struct inpcb. */
 #define	inp_fport	inp_inc.inc_fport
 #define	inp_lport	inp_inc.inc_lport
 #define	inp_faddr	inp_inc.inc_faddr
 #define	inp_laddr	inp_inc.inc_laddr
 #define	inp_ip_tos	inp_depend4.inp4_ip_tos
 #define	inp_options	inp_depend4.inp4_options
 #define	inp_moptions	inp_depend4.inp4_moptions
 
 /* IPv6 (in6p_*) names for fields of the same structure. */
 #define	in6p_faddr	inp_inc.inc6_faddr
 #define	in6p_laddr	inp_inc.inc6_laddr
 #define	in6p_hops	inp_depend6.inp6_hops	/* default hop limit */
 #define	in6p_ip6_nxt	inp_ip_p
 #define	in6p_flowinfo	inp_flow
 #define	in6p_vflag	inp_vflag
 #define	in6p_options	inp_depend6.inp6_options
 #define	in6p_outputopts	inp_depend6.inp6_outputopts
 #define	in6p_moptions	inp_depend6.inp6_moptions
 #define	in6p_icmp6filt	inp_depend6.inp6_icmp6filt
 #define	in6p_cksum	inp_depend6.inp6_cksum
 #define	in6p_flags	inp_flags  /* for KAME src sync over BSD*'s */
 #define	in6p_socket	inp_socket  /* for KAME src sync over BSD*'s */
 #define	in6p_lport	inp_lport  /* for KAME src sync over BSD*'s */
 #define	in6p_fport	inp_fport  /* for KAME src sync over BSD*'s */
 #define	in6p_ppcb	inp_ppcb  /* for KAME src sync over BSD*'s */
 
 /*
  * The range of the generation count, as used in this implementation, is 9e19.
  * We would have to create 300 billion connections per second for this number
  * to roll over in a year.  This seems sufficiently unlikely that we simply
  * don't concern ourselves with that possibility.
  */
 
 /*
  * Interface exported to userland by various protocols which use inpcbs.  Hack
  * alert -- only define if struct xsocket is in scope.
  */
 #ifdef _SYS_SOCKETVAR_H_
 /* One exported PCB: the inpcb together with its socket's xsocket. */
 struct	xinpcb {
 	size_t	xi_len;		/* length of this structure */
 	struct	inpcb xi_inp;
 	struct	xsocket xi_socket;
 	u_quad_t	xi_alignment_hack;
 };
 
 /* Exported summary record: PCB count and generation counts. */
 struct	xinpgen {
 	size_t	xig_len;	/* length of this structure */
 	u_int	xig_count;	/* number of PCBs at this time */
 	inp_gen_t xig_gen;	/* generation count at this time */
 	so_gen_t xig_sogen;	/* socket generation count at this time */
 };
 #endif /* _SYS_SOCKETVAR_H_ */
 
 /*
  * Head of the list of all PCBs bound to one local port.  Port heads are
  * themselves chained on the pcbinfo port hash (ipi_porthashbase).
  */
 struct inpcbport {
 	LIST_ENTRY(inpcbport) phd_hash;	/* linkage on the port hash chain */
 	struct inpcbhead phd_pcblist;	/* PCBs bound to phd_port */
 	u_short phd_port;		/* the local port */
 };
 
 /*
  * Global data structure for each high-level protocol (UDP, TCP, ...) in both
  * IPv4 and IPv6.  Holds inpcb lists and information for managing them.
  */
 struct inpcbinfo {
 	/*
 	 * Global list of inpcbs on the protocol.
 	 */
 	struct inpcbhead	*ipi_listhead;
 	u_int			 ipi_count;	/* number of inpcbs on the list */
 
 	/*
 	 * Global hash of inpcbs, hashed by local and foreign addresses and
 	 * port numbers.
 	 */
 	struct inpcbhead	*ipi_hashbase;
 	u_long			 ipi_hashmask;
 
 	/*
 	 * Global hash of inpcbs, hashed by only local port number.
 	 */
 	struct inpcbporthead	*ipi_porthashbase;
 	u_long			 ipi_porthashmask;
 
 	/*
 	 * Fields associated with port lookup and allocation.
 	 */
 	u_short			 ipi_lastport;
 	u_short			 ipi_lastlow;
 	u_short			 ipi_lasthi;
 
 	/*
 	 * UMA zone from which inpcbs are allocated for this protocol.
 	 */
 	struct	uma_zone	*ipi_zone;
 
 	/*
 	 * Generation count--incremented each time a connection is allocated
 	 * or freed.
 	 */
 	u_quad_t		 ipi_gencnt;
 	/* The pcbinfo lock, see INP_INFO_*LOCK(). */
 	struct rwlock		 ipi_lock;
 
 	/*
 	 * vimage 1
 	 * general use 1
 	 */
 	void 			*ipi_pspare[2];
 };
 
 /*
  * Per-inpcb lock operations; all act on the rwlock inp_lock.  Note that
  * INP_LOCK_INIT() does not use its 'd' argument: only 't' is passed on to
  * rw_init_flags().
  */
 #define INP_LOCK_INIT(inp, d, t) \
 	rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE |  RW_DUPOK)
 #define INP_LOCK_DESTROY(inp)	rw_destroy(&(inp)->inp_lock)
 #define INP_RLOCK(inp)		rw_rlock(&(inp)->inp_lock)
 #define INP_WLOCK(inp)		rw_wlock(&(inp)->inp_lock)
 #define INP_TRY_RLOCK(inp)	rw_try_rlock(&(inp)->inp_lock)
 #define INP_TRY_WLOCK(inp)	rw_try_wlock(&(inp)->inp_lock)
 #define INP_RUNLOCK(inp)	rw_runlock(&(inp)->inp_lock)
 #define INP_WUNLOCK(inp)	rw_wunlock(&(inp)->inp_lock)
 /* Assertions: held in any mode, read-held, write-held, not held. */
 #define INP_LOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_LOCKED)
 #define	INP_RLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_RLOCKED)
 #define	INP_WLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_WLOCKED)
 #define	INP_UNLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
 
 #ifdef _KERNEL
 /*
  * These locking functions are for inpcb consumers outside of sys/netinet,
  * more specifically, they were added for the benefit of TOE drivers. The
  * macros are reserved for use by the stack.
  */
 void inp_wlock(struct inpcb *);
 void inp_wunlock(struct inpcb *);
 void inp_rlock(struct inpcb *);
 void inp_runlock(struct inpcb *);
 
 /*
  * The assertion helpers are real functions only when INVARIANTS is
  * defined; otherwise they are empty inline stubs.
  */
 #ifdef INVARIANTS
 void inp_lock_assert(struct inpcb *);
 void inp_unlock_assert(struct inpcb *);
 #else
 static __inline void
 inp_lock_assert(struct inpcb *inp __unused)
 {
 }
 
 static __inline void
 inp_unlock_assert(struct inpcb *inp __unused)
 {
 }
 
 #endif
 
 /* Call func(inp, arg) for each inpcb visited. */
 void	inp_apply_all(void (*func)(struct inpcb *, void *), void *arg);
 /* Read and write the inpcb's IP type-of-service value. */
 int 	inp_ip_tos_get(const struct inpcb *inp);
 void 	inp_ip_tos_set(struct inpcb *inp, int val);
 /* Conversions from an inpcb to its socket and to its tcpcb. */
 struct socket *
 	inp_inpcbtosocket(struct inpcb *inp);
 struct tcpcb *
 	inp_inpcbtotcpcb(struct inpcb *inp);
 /* Copy out the IPv4 local/foreign addresses and ports. */
 void 	inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
 		uint32_t *faddr, uint16_t *fp);
 
 #endif /* _KERNEL */
 
 /*
  * Locking primitives for a struct inpcbinfo.  Each macro wraps the matching
  * rw_*() operation on the ipi_lock rwlock; the lock is initialized with
  * RW_RECURSE and uses 'd' as the name passed to rw_init_flags().
  */
 #define INP_INFO_LOCK_INIT(ipi, d) \
 	rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE)
 #define INP_INFO_LOCK_DESTROY(ipi)  rw_destroy(&(ipi)->ipi_lock)
 #define INP_INFO_RLOCK(ipi)	rw_rlock(&(ipi)->ipi_lock)
 #define INP_INFO_WLOCK(ipi)	rw_wlock(&(ipi)->ipi_lock)
 #define INP_INFO_TRY_RLOCK(ipi)	rw_try_rlock(&(ipi)->ipi_lock)
 #define INP_INFO_TRY_WLOCK(ipi)	rw_try_wlock(&(ipi)->ipi_lock)
 #define INP_INFO_RUNLOCK(ipi)	rw_runlock(&(ipi)->ipi_lock)
 #define INP_INFO_WUNLOCK(ipi)	rw_wunlock(&(ipi)->ipi_lock)
 /* Assertions on the state of the pcbinfo lock. */
 #define	INP_INFO_LOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
 #define INP_INFO_RLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_RLOCKED)
 #define INP_INFO_WLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
 #define INP_INFO_UNLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
 
 #define INP_PCBHASH(faddr, lport, fport, mask) \
 	(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
 #define INP_PCBPORTHASH(lport, mask) \
 	(ntohs((lport)) & (mask))
 
 /*
  * Flags for inp_vflags -- historically version flags only, but now quite a
  * bit more due to an overflow of inp_flag, leading to some locking ambiguity
  * as some bits are stable from initial allocation, and others may change.
  */
 #define	INP_IPV4	0x1
 #define	INP_IPV6	0x2
 #define	INP_IPV6PROTO	0x4		/* opened under IPv6 protocol */
 #define	INP_TIMEWAIT	0x8		/* inpcb in TIMEWAIT, ppcb is tcptw */
 #define	INP_ONESBCAST	0x10		/* send all-ones broadcast */
 #define	INP_DROPPED	0x20		/* protocol drop flag */
 #define	INP_SOCKREF	0x40		/* strong socket reference */
 
 /*
  * Flags for inp_flag.
  */
 #define	INP_RECVOPTS		0x01	/* receive incoming IP options */
 #define	INP_RECVRETOPTS		0x02	/* receive IP options for reply */
 #define	INP_RECVDSTADDR		0x04	/* receive IP dst address */
 #define	INP_HDRINCL		0x08	/* user supplies entire IP header */
 #define	INP_HIGHPORT		0x10	/* user wants "high" port binding */
 #define	INP_LOWPORT		0x20	/* user wants "low" port binding */
 #define	INP_ANONPORT		0x40	/* port chosen for user */
 #define	INP_RECVIF		0x80	/* receive incoming interface */
 #define	INP_MTUDISC		0x100	/* user can do MTU discovery */
 #define	INP_FAITH		0x200	/* accept FAITH'ed connections */
 #define	INP_RECVTTL		0x400	/* receive incoming IP TTL */
 #define	INP_DONTFRAG		0x800	/* don't fragment packet */
 
 #define IN6P_IPV6_V6ONLY	0x008000 /* restrict AF_INET6 socket for v6 */
 
 #define	IN6P_PKTINFO		0x010000 /* receive IP6 dst and I/F */
 #define	IN6P_HOPLIMIT		0x020000 /* receive hoplimit */
 #define	IN6P_HOPOPTS		0x040000 /* receive hop-by-hop options */
 #define	IN6P_DSTOPTS		0x080000 /* receive dst options after rthdr */
 #define	IN6P_RTHDR		0x100000 /* receive routing header */
 #define	IN6P_RTHDRDSTOPTS	0x200000 /* receive dstoptions before rthdr */
 #define	IN6P_TCLASS		0x400000 /* receive traffic class value */
 #define	IN6P_AUTOFLOWLABEL	0x800000 /* attach flowlabel automatically */
 #define	IN6P_RFC2292		0x40000000 /* used RFC2292 API on the socket */
 #define	IN6P_MTU		0x80000000 /* receive path MTU */
 
 #define	INP_CONTROLOPTS		(INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
 				 INP_RECVIF|INP_RECVTTL|\
 				 IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
 				 IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
 				 IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\
 				 IN6P_MTU)
 #define	INP_UNMAPPABLEOPTS	(IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|\
 				 IN6P_TCLASS|IN6P_AUTOFLOWLABEL)
 
  /* for KAME src sync over BSD*'s */
 #define	IN6P_HIGHPORT		INP_HIGHPORT
 #define	IN6P_LOWPORT		INP_LOWPORT
 #define	IN6P_ANONPORT		INP_ANONPORT
 #define	IN6P_RECVIF		INP_RECVIF
 #define	IN6P_MTUDISC		INP_MTUDISC
 #define	IN6P_FAITH		INP_FAITH
 #define	IN6P_CONTROLOPTS INP_CONTROLOPTS
 	/*
 	 * socket AF version is {newer than,or include}
 	 * actual datagram AF version
 	 */
 
 #define	INPLOOKUP_WILDCARD	1
 #define	sotoinpcb(so)	((struct inpcb *)(so)->so_pcb)
 #define	sotoin6pcb(so)	sotoinpcb(so) /* for KAME src sync over BSD*'s */
 
 /*
  * Address family of the protocol domain a socket belongs to
  * (so->so_proto->pr_domain->dom_family).  The argument and the whole
  * expansion are parenthesized so the macro can be given any pointer
  * expression (e.g. &sock) and used inside a larger expression.
  */
 #define	INP_SOCKAF(so) ((so)->so_proto->pr_domain->dom_family)
 
 /*
  * Non-zero when the socket's address family equals 'af'.  'af' is
  * parenthesized so operators of lower precedence than == in the argument
  * do not change the meaning of the comparison.
  */
 #define	INP_CHECK_SOCKAF(so, af)	(INP_SOCKAF(so) == (af))
 
 #ifdef _KERNEL
 #ifdef VIMAGE_GLOBALS
 extern int	ipport_reservedhigh;
 extern int	ipport_reservedlow;
 extern int	ipport_lowfirstauto;
 extern int	ipport_lowlastauto;
 extern int	ipport_firstauto;
 extern int	ipport_lastauto;
 extern int	ipport_hifirstauto;
 extern int	ipport_hilastauto;
 extern int	ipport_randomized;
 extern int	ipport_randomcps;
 extern int	ipport_randomtime;
 extern int	ipport_stoprandom;
 extern int	ipport_tcpallocs;
 #endif
 extern struct callout ipport_tick_callout;
 
 void	in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
 int	in_pcballoc(struct socket *, struct inpcbinfo *);
 int	in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *);
 int	in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
 	    u_short *, struct ucred *);
 int	in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *);
 int	in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
 	    u_short *, in_addr_t *, u_short *, struct inpcb **,
 	    struct ucred *);
 void	in_pcbdetach(struct inpcb *);
 void	in_pcbdisconnect(struct inpcb *);
 void	in_pcbdrop(struct inpcb *);
 void	in_pcbfree(struct inpcb *);
 int	in_pcbinshash(struct inpcb *);
 struct inpcb *
 	in_pcblookup_local(struct inpcbinfo *,
 	    struct in_addr, u_short, int, struct ucred *);
 struct inpcb *
 	in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *);
 void	in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
 	    int, struct inpcb *(*)(struct inpcb *, int));
 void	in_pcbref(struct inpcb *);
 void	in_pcbrehash(struct inpcb *);
 int	in_pcbrele(struct inpcb *);
 void	in_pcbsetsolabel(struct socket *so);
 int	in_getpeeraddr(struct socket *so, struct sockaddr **nam);
 int	in_getsockaddr(struct socket *so, struct sockaddr **nam);
 struct sockaddr *
 	in_sockaddr(in_port_t port, struct in_addr *addr);
 void	in_pcbsosetlabel(struct socket *so);
 void	in_pcbremlists(struct inpcb *inp);
 void	ipport_tick(void *xtp);
 
 /*
  * Debugging routines compiled in when DDB is present.
  */
 void	db_print_inpcb(struct inpcb *inp, const char *name, int indent);
 
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IN_PCB_H_ */
Index: head/sys/netinet/tcp_hostcache.c
===================================================================
--- head/sys/netinet/tcp_hostcache.c	(revision 186221)
+++ head/sys/netinet/tcp_hostcache.c	(revision 186222)
@@ -1,665 +1,665 @@
 /*-
  * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * The tcp_hostcache moves the tcp-specific cached metrics from the routing
  * table to a dedicated structure indexed by the remote IP address.  It keeps
  * information on the measured TCP parameters of past TCP sessions to allow
  * better initial start values to be used with later connections to/from the
  * same source.  Depending on the network parameters (delay, bandwidth, max
  * MTU, congestion window) between local and remote sites, this can lead to
  * significant speed-ups for new TCP connections after the first one.
  *
  * Due to the tcp_hostcache, all TCP-specific metrics information in the
  * routing table have been removed.  The inpcb no longer keeps a pointer to
  * the routing entry, and protocol-initiated route cloning has been removed
  * as well.  With these changes, the routing table has gone back to being
  * more lightweight and only carries information related to packet forwarding.
  *
  * tcp_hostcache is designed for multiple concurrent access in SMP
  * environments and high contention.  All bucket rows have their own lock and
  * thus multiple lookups and modifies can be done at the same time as long as
  * they are in different bucket rows.  If a request for insertion of a new
  * record can't be satisfied, it simply returns an empty structure.  Nobody
  * and nothing outside of tcp_hostcache.c will ever point directly to any
  * entry in the tcp_hostcache.  All communication is done in an
  * object-oriented way and only functions of tcp_hostcache will manipulate
  * hostcache entries.  Otherwise, we are unable to achieve good behaviour in
  * concurrent access situations.  Since tcp_hostcache is only caching
  * information, there are no fatal consequences if we either can't satisfy
  * any particular request or have to drop/overwrite an existing entry because
  * of bucket limit memory constraints.
  */
 
 /*
  * Many thanks to jlemon for basic structure of tcp_syncache which is being
  * followed here.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/vimage.h>
 
 #include <net/if.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_hostcache.h>
 #include <netinet/vinet.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 
 #include <vm/uma.h>
 
 /*
  * Arbitrary default values.  The time values are in seconds and are
  * parenthesized so that they expand safely inside larger expressions.
  */
 #define TCP_HOSTCACHE_HASHSIZE		512
 #define TCP_HOSTCACHE_BUCKETLIMIT	30
 #define TCP_HOSTCACHE_EXPIRE		(60*60)	/* one hour */
 #define TCP_HOSTCACHE_PRUNE		(5*60)	/* every 5 minutes */
 
 #ifdef VIMAGE_GLOBALS
 static struct tcp_hostcache tcp_hostcache;
 static struct callout tcp_hc_callout;
 #endif
 
 static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *);
 static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
 static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
 static void tcp_hc_purge(void *);
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0,
     "TCP Host cache");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, cachelimit,
     CTLFLAG_RDTUN, tcp_hostcache.cache_limit, 0,
     "Overall entry limit for hostcache");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, hashsize,
     CTLFLAG_RDTUN, tcp_hostcache.hashsize, 0,
     "Size of TCP hostcache hashtable");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
     CTLFLAG_RDTUN, tcp_hostcache.bucket_limit, 0,
     "Per-bucket hash limit for hostcache");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, count,
     CTLFLAG_RD, tcp_hostcache.cache_count, 0,
     "Current number of entries in hostcache");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, expire,
     CTLFLAG_RW, tcp_hostcache.expire, 0,
     "Expire time of TCP hostcache entries");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, prune,
      CTLFLAG_RW, tcp_hostcache.prune, 0, "Time between purge runs");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, purge,
     CTLFLAG_RW, tcp_hostcache.purgeall, 0,
     "Expire all entires on next purge run");
 
 SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0,
     sysctl_tcp_hc_list, "A", "List of all hostcache entries");
 
 
 static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");
 
 #define HOSTCACHE_HASH(ip) \
 	(((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) &	\
 	  V_tcp_hostcache.hashmask)
 
 /* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */
 #define HOSTCACHE_HASH6(ip6)				\
 	(((ip6)->s6_addr32[0] ^				\
 	  (ip6)->s6_addr32[1] ^				\
 	  (ip6)->s6_addr32[2] ^				\
 	  (ip6)->s6_addr32[3]) &			\
 	 V_tcp_hostcache.hashmask)
 
 #define THC_LOCK(lp)		mtx_lock(lp)
 #define THC_UNLOCK(lp)		mtx_unlock(lp)
 
 /*
  * Initialize the TCP hostcache: set the defaults, apply the loader
  * tunables, allocate and initialize the hash table rows, create the UMA
  * zone that backs the entries and arm the periodic purge callout.
  */
 void
 tcp_hc_init(void)
 {
 	INIT_VNET_INET(curvnet);
 	int i;
 
 	/*
 	 * Initialize hostcache structures.
 	 */
 	V_tcp_hostcache.cache_count = 0;
 	V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
 	V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
 	V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
 	V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE;
 
 	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
 	    &V_tcp_hostcache.hashsize);
 	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
 	    &V_tcp_hostcache.bucket_limit);
 
 	/*
 	 * The hash mask below is only valid for a positive power of two.
 	 * powerof2() also evaluates true for 0, which would yield an empty
 	 * table and an all-ones mask, so non-positive sizes are rejected
 	 * explicitly.
 	 */
 	if (V_tcp_hostcache.hashsize <= 0 ||
 	    !powerof2(V_tcp_hostcache.hashsize)) {
 		printf("WARNING: hostcache hash size is not a power of 2.\n");
 		V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */
 	}
 	V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1;
 
 	/*
 	 * Derive the default overall limit from the final (possibly tuned)
 	 * hash size and bucket limit, then let the cachelimit tunable
 	 * override it.
 	 */
 	V_tcp_hostcache.cache_limit =
 	    V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit;
 	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
 	    &V_tcp_hostcache.cache_limit);
 
 	/*
 	 * Allocate the hash table.
 	 */
 	V_tcp_hostcache.hashbase = (struct hc_head *)
 	    malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head),
 		   M_HOSTCACHE, M_WAITOK | M_ZERO);
 
 	/*
 	 * Initialize the hash buckets: empty list, zero length and one
 	 * mutex per bucket row.
 	 */
 	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
 		TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket);
 		V_tcp_hostcache.hashbase[i].hch_length = 0;
 		mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
 			  NULL, MTX_DEF);
 	}
 
 	/*
 	 * Create the zone the hostcache entries are allocated from and cap
 	 * it at the overall entry limit.
 	 */
 	V_tcp_hostcache.zone =
 	    uma_zcreate("hostcache", sizeof(struct hc_metrics),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit);
 
 	/*
 	 * Set up periodic cache cleanup.
 	 */
 	callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE);
 	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
 	    tcp_hc_purge, 0);
 }
 
 /*
  * Internal function: look up the hostcache entry for the foreign address
  * in 'inc', or return NULL if there is none.
  *
  * The bucket row is selected by hashing the foreign IPv6 or IPv4 address,
  * depending on INC_ISIPV6 in inc_flags, and entries are matched by
  * comparing the full address.
  *
  * Locking: when an entry is returned, the mutex of its bucket row is still
  * held and the caller becomes responsible for unlocking the row after he
  * is done reading/modifying the entry.  When NULL is returned the row has
  * already been unlocked.
  */
 static struct hc_metrics *
 tcp_hc_lookup(struct in_conninfo *inc)
 {
 	INIT_VNET_INET(curvnet);
 	int hash;
 	struct hc_head *hc_head;
 	struct hc_metrics *hc_entry;
 
 	KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer"));
 
 	/*
 	 * Hash the foreign ip address.
 	 */
-	if (inc->inc_isipv6)
+	if (inc->inc_flags & INC_ISIPV6)
 		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
 	else
 		hash = HOSTCACHE_HASH(&inc->inc_faddr);
 
 	hc_head = &V_tcp_hostcache.hashbase[hash];
 
 	/*
 	 * Acquire lock for this bucket row; we release the lock if we don't
 	 * find an entry, otherwise the caller has to unlock after he is
 	 * done.
 	 */
 	THC_LOCK(&hc_head->hch_mtx);
 
 	/*
 	 * Iterate through entries in bucket row looking for a match.  A hit
 	 * returns straight out of the loop with the row mutex still held.
 	 */
 	TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
-		if (inc->inc_isipv6) {
+		if (inc->inc_flags & INC_ISIPV6) {
 			if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
 			    sizeof(inc->inc6_faddr)) == 0)
 				return hc_entry;
 		} else {
 			if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
 			    sizeof(inc->inc_faddr)) == 0)
 				return hc_entry;
 		}
 	}
 
 	/*
 	 * We were unsuccessful and didn't find anything.
 	 */
 	THC_UNLOCK(&hc_head->hch_mtx);
 	return NULL;
 }
 
 /*
  * Internal function: insert an entry for the foreign address in 'inc' into
  * the hostcache, or return NULL if no entry could be obtained.
  *
  * If the bucket row or the whole cache is at its limit, the entry at the
  * tail of the row is recycled; otherwise a fresh entry is allocated from
  * the zone without sleeping.  The entry is zeroed, keyed with the foreign
  * address, given a fresh expire time and linked at the head of the row.
  * No check is made here for an already existing entry with the same
  * address.
  *
  * Locking: when an entry is returned, the mutex of its bucket row is still
  * held and the caller becomes responsible for unlocking the row after he
  * is done reading/modifying the entry.  When NULL is returned the row has
  * already been unlocked.
  */
 static struct hc_metrics *
 tcp_hc_insert(struct in_conninfo *inc)
 {
 	INIT_VNET_INET(curvnet);
 	int hash;
 	struct hc_head *hc_head;
 	struct hc_metrics *hc_entry;
 
 	KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer"));
 
 	/*
 	 * Hash the foreign ip address.
 	 */
-	if (inc->inc_isipv6)
+	if (inc->inc_flags & INC_ISIPV6)
 		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
 	else
 		hash = HOSTCACHE_HASH(&inc->inc_faddr);
 
 	hc_head = &V_tcp_hostcache.hashbase[hash];
 
 	/*
 	 * Acquire lock for this bucket row; we release the lock if we fail
 	 * to obtain an entry, otherwise the caller has to unlock after he
 	 * is done.
 	 */
 	THC_LOCK(&hc_head->hch_mtx);
 
 	/*
 	 * If the bucket limit (or the overall cache limit) is reached, reuse
 	 * the element at the tail of this row.  Entries are linked and
 	 * re-linked at the head, so the tail is the one touched longest ago.
 	 */
 	if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit ||
 	    V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) {
 		hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
 		/*
 		 * At first we were dropping the last element, just to
 		 * reacquire it in the next two lines again, which isn't very
 		 * efficient.  Instead just reuse the least used element.
 		 * We may drop something that is still "in-use" but we can be
 		 * "lossy".
 		 * Just give up if this bucket row is empty and we don't have
 		 * anything to replace.
 		 */
 		if (hc_entry == NULL) {
 			THC_UNLOCK(&hc_head->hch_mtx);
 			return NULL;
 		}
 		/*
 		 * Unlink the victim and undo its accounting; the counters
 		 * are bumped again when it is re-linked below.
 		 * NOTE(review): cache_count is shared by all rows but is
 		 * updated here while holding only this row's mutex --
 		 * confirm that the resulting imprecision is acceptable.
 		 */
 		TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
 		V_tcp_hostcache.hashbase[hash].hch_length--;
 		V_tcp_hostcache.cache_count--;
 		V_tcpstat.tcps_hc_bucketoverflow++;
 #if 0
 		uma_zfree(V_tcp_hostcache.zone, hc_entry);
 #endif
 	} else {
 		/*
 		 * Allocate a new entry, or balk if not possible.
 		 */
 		hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT);
 		if (hc_entry == NULL) {
 			THC_UNLOCK(&hc_head->hch_mtx);
 			return NULL;
 		}
 	}
 
 	/*
 	 * Initialize basic information of hostcache entry.  A recycled
 	 * entry is wiped as well, so no old metrics survive.
 	 */
 	bzero(hc_entry, sizeof(*hc_entry));
-	if (inc->inc_isipv6)
+	if (inc->inc_flags & INC_ISIPV6)
 		bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6));
 	else
 		hc_entry->ip4 = inc->inc_faddr;
 	hc_entry->rmx_head = hc_head;
 	hc_entry->rmx_expire = V_tcp_hostcache.expire;
 
 	/*
 	 * Put it upfront.
 	 */
 	TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
 	V_tcp_hostcache.hashbase[hash].hch_length++;
 	V_tcp_hostcache.cache_count++;
 	V_tcpstat.tcps_hc_added++;
 
 	return hc_entry;
 }
 
 /*
  * External function: copy the cached metrics for the peer described by
  * 'inc' into the caller-supplied hc_metrics_lite.  When the hostcache has
  * no entry for that peer the whole structure is zeroed instead, so every
  * metric reads as 0.  A successful lookup counts as a hit and restarts the
  * entry's expire time.
  */
 void
 tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
 {
 	INIT_VNET_INET(curvnet);
 	struct hc_metrics *hit;
 
 	/* A non-NULL result comes back with its bucket row locked. */
 	hit = tcp_hc_lookup(inc);
 	if (hit != NULL) {
 		hit->rmx_hits++;
 		hit->rmx_expire = V_tcp_hostcache.expire; /* start over again */
 
 		hc_metrics_lite->rmx_mtu = hit->rmx_mtu;
 		hc_metrics_lite->rmx_ssthresh = hit->rmx_ssthresh;
 		hc_metrics_lite->rmx_rtt = hit->rmx_rtt;
 		hc_metrics_lite->rmx_rttvar = hit->rmx_rttvar;
 		hc_metrics_lite->rmx_bandwidth = hit->rmx_bandwidth;
 		hc_metrics_lite->rmx_cwnd = hit->rmx_cwnd;
 		hc_metrics_lite->rmx_sendpipe = hit->rmx_sendpipe;
 		hc_metrics_lite->rmx_recvpipe = hit->rmx_recvpipe;
 
 		/* Done with the entry; release its bucket row. */
 		THC_UNLOCK(&hit->rmx_head->hch_mtx);
 	} else {
 		/* Nothing cached for this peer. */
 		bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
 	}
 }
 
 /*
  * External function: return the path MTU cached for the peer described by
  * 'inc'.  Returns 0 if there is no entry for that peer; an entry whose MTU
  * was never set also yields 0.  A successful lookup counts as a hit and
  * restarts the entry's expire time.
  */
 u_long
 tcp_hc_getmtu(struct in_conninfo *inc)
 {
 	INIT_VNET_INET(curvnet);
 	struct hc_metrics *hit;
 	u_long cached_mtu = 0;
 
 	/* A non-NULL result comes back with its bucket row locked. */
 	hit = tcp_hc_lookup(inc);
 	if (hit != NULL) {
 		hit->rmx_hits++;
 		hit->rmx_expire = V_tcp_hostcache.expire; /* start over again */
 		cached_mtu = hit->rmx_mtu;
 		THC_UNLOCK(&hit->rmx_head->hch_mtx);
 	}
 	return cached_mtu;
 }
 
 /*
  * External function: record 'mtu' as the path MTU for the peer described
  * by 'inc'.  An entry is created if none exists yet; if that fails the
  * update is silently dropped.  The entry is moved to the front of its
  * bucket row and its expire time is restarted.
  */
 void
 tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu)
 {
 	INIT_VNET_INET(curvnet);
 	struct hc_metrics *ent;
 	struct hc_head *row;
 
 	/*
 	 * Use the existing entry, or fall back to inserting a new one.
 	 * Either way a non-NULL result holds the bucket row lock.
 	 */
 	ent = tcp_hc_lookup(inc);
 	if (ent == NULL && (ent = tcp_hc_insert(inc)) == NULL)
 		return;
 	row = ent->rmx_head;
 
 	ent->rmx_updates++;
 	ent->rmx_expire = V_tcp_hostcache.expire; /* start over again */
 	ent->rmx_mtu = mtu;
 
 	/* Move it to the head so the next lookup finds it faster. */
 	TAILQ_REMOVE(&row->hch_bucket, ent, rmx_q);
 	TAILQ_INSERT_HEAD(&row->hch_bucket, ent, rmx_q);
 
 	THC_UNLOCK(&row->hch_mtx);
 }
 
 /*
  * External function: fold the metrics in 'hcml' into the hostcache entry
  * for the peer described by 'inc'.  An entry is created if none exists
  * yet; if that fails the update is silently dropped.
  *
  * Only non-zero metrics in 'hcml' are considered.  A metric that is still
  * 0 in the entry takes the new value as is; otherwise the stored value
  * becomes the average of the stored and the new value.  The entry is then
  * moved to the front of its bucket row.
  */
 void
 tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
 {
 	INIT_VNET_INET(curvnet);
 	struct hc_metrics *ent;
 	struct hc_head *row;
 
 	/* A non-NULL result from either call holds the bucket row lock. */
 	ent = tcp_hc_lookup(inc);
 	if (ent == NULL && (ent = tcp_hc_insert(inc)) == NULL)
 		return;
 	row = ent->rmx_head;
 
 	ent->rmx_updates++;
 	ent->rmx_expire = V_tcp_hostcache.expire; /* start over again */
 
 /* Seed an unset metric with the new value, otherwise average the two. */
 #define HC_BLEND(field) do {						\
 	if (ent->field == 0)						\
 		ent->field = hcml->field;				\
 	else								\
 		ent->field = (ent->field + hcml->field) / 2;		\
 } while (0)
 
 	/* These three metrics are also counted in the TCP statistics. */
 	if (hcml->rmx_rtt != 0) {
 		HC_BLEND(rmx_rtt);
 		V_tcpstat.tcps_cachedrtt++;
 	}
 	if (hcml->rmx_rttvar != 0) {
 		HC_BLEND(rmx_rttvar);
 		V_tcpstat.tcps_cachedrttvar++;
 	}
 	if (hcml->rmx_ssthresh != 0) {
 		HC_BLEND(rmx_ssthresh);
 		V_tcpstat.tcps_cachedssthresh++;
 	}
 
 	/* No statistics counter is bumped for the remaining metrics. */
 	if (hcml->rmx_bandwidth != 0)
 		HC_BLEND(rmx_bandwidth);
 	if (hcml->rmx_cwnd != 0)
 		HC_BLEND(rmx_cwnd);
 	if (hcml->rmx_sendpipe != 0)
 		HC_BLEND(rmx_sendpipe);
 	if (hcml->rmx_recvpipe != 0)
 		HC_BLEND(rmx_recvpipe);
 #undef HC_BLEND
 
 	/* Move the entry to the head of its row, then release the row. */
 	TAILQ_REMOVE(&row->hch_bucket, ent, rmx_q);
 	TAILQ_INSERT_HEAD(&row->hch_bucket, ent, rmx_q);
 	THC_UNLOCK(&row->hch_mtx);
 }
 
 /*
  * Sysctl function: prints the list and values of all hostcache entries in
  * unsorted order.
  *
  * The output buffer is sized from a snapshot of cache_count taken without
  * any lock, with one 'linesize' slot per entry plus one for the header.
  * Entries may be added while the rows are walked and a formatted line may
  * not fit in one slot, so every write is bounded: a line is truncated to
  * its slot, and the walk stops once fewer than 'linesize' bytes remain.
  * In that case the listing is incomplete rather than overrunning the
  * buffer.
  *
  * Returns the result of SYSCTL_OUT().
  */
 static int
 sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
 {
 	INIT_VNET_INET(curvnet);
 	int bufsize;
 	int linesize = 128;
 	char *p, *buf;
 	int len, i, error;
 	int full = 0;
 	struct hc_metrics *hc_entry;
 #ifdef INET6
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 
 	bufsize = linesize * (V_tcp_hostcache.cache_count + 1);
 
 	p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
 
 	len = snprintf(p, linesize,
 		"\nIP address        MTU  SSTRESH      RTT   RTTVAR BANDWIDTH "
 		"    CWND SENDPIPE RECVPIPE HITS  UPD  EXP\n");
 	/* snprintf() reports the untruncated length; clamp to what fit. */
 	if (len >= linesize)
 		len = linesize - 1;
 	p += len;
 
 #define msec(u) (((u) + 500) / 1000)
 	for (i = 0; i < V_tcp_hostcache.hashsize && !full; i++) {
 		THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
 		TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket,
 			      rmx_q) {
 			/*
 			 * Stop when there is no room left for a full line;
 			 * the cache grew after the buffer was sized.
 			 */
 			if (bufsize - (p - buf) < linesize) {
 				full = 1;
 				break;
 			}
 			/*
 			 * NOTE(review): inet_ntoa() is believed to format
 			 * into a shared static buffer -- confirm this is
 			 * safe against concurrent callers.
 			 */
 			len = snprintf(p, linesize,
 			    "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu "
 			    "%4lu %4lu %4i\n",
 			    hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) :
 #ifdef INET6
 				ip6_sprintf(ip6buf, &hc_entry->ip6),
 #else
 				"IPv6?",
 #endif
 			    hc_entry->rmx_mtu,
 			    hc_entry->rmx_ssthresh,
 			    msec(hc_entry->rmx_rtt *
 				(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
 			    msec(hc_entry->rmx_rttvar *
 				(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
 			    hc_entry->rmx_bandwidth * 8,
 			    hc_entry->rmx_cwnd,
 			    hc_entry->rmx_sendpipe,
 			    hc_entry->rmx_recvpipe,
 			    hc_entry->rmx_hits,
 			    hc_entry->rmx_updates,
 			    hc_entry->rmx_expire);
 			/* A truncated line only occupies linesize - 1 bytes. */
 			if (len >= linesize)
 				len = linesize - 1;
 			p += len;
 		}
 		THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
 	}
 #undef msec
 	error = SYSCTL_OUT(req, buf, p - buf);
 	free(buf, M_TEMP);
 	return(error);
 }
 
 /*
  * Expire and purge (old|all) entries in the tcp_hostcache.  Runs
  * periodically from the callout.
  */
 static void
 tcp_hc_purge(void *arg)
 {
 	INIT_VNET_INET(curvnet);
 	struct hc_metrics *hc_entry, *hc_next;
 	int all = (intptr_t)arg;
 	int i;
 
 	if (V_tcp_hostcache.purgeall) {
 		all = 1;
 		V_tcp_hostcache.purgeall = 0;
 	}
 
 	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
 		THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
 		TAILQ_FOREACH_SAFE(hc_entry,
 		    &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) {
 			if (all || hc_entry->rmx_expire <= 0) {
 				TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket,
 					      hc_entry, rmx_q);
 				uma_zfree(V_tcp_hostcache.zone, hc_entry);
 				V_tcp_hostcache.hashbase[i].hch_length--;
 				V_tcp_hostcache.cache_count--;
 			} else
 				hc_entry->rmx_expire -= V_tcp_hostcache.prune;
 		}
 		THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
 	}
 
 	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
 	    tcp_hc_purge, arg);
 }
Index: head/sys/netinet/tcp_input.c
===================================================================
--- head/sys/netinet/tcp_input.c	(revision 186221)
+++ head/sys/netinet/tcp_input.c	(revision 186222)
@@ -1,3406 +1,3403 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipfw.h"		/* for ipfw_fwd	*/
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/vimage.h>
 
 #include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_syncache.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif /* TCPDEBUG */
 #include <netinet/vinet.h>
 
 #ifdef INET6
 #include <netinet6/vinet6.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 static const int tcprexmtthresh = 3;
 
 #ifdef VIMAGE_GLOBALS
 struct	tcpstat tcpstat;
 int	blackhole;
 int	tcp_delack_enabled;
 int	drop_synfin;
 int	tcp_do_rfc3042;
 int	tcp_do_rfc3390;
 int	tcp_do_ecn;
 int	tcp_ecn_maxretries;
 int	tcp_insecure_rst;
 int	tcp_do_autorcvbuf;
 int	tcp_autorcvbuf_inc;
 int	tcp_autorcvbuf_max;
 #endif
 
 SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats,
     CTLFLAG_RW, tcpstat , tcpstat,
     "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
 
 int tcp_log_in_vain = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
     &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
     blackhole, 0, "Do not send RST on segments to closed ports");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, delayed_ack,
     CTLFLAG_RW, tcp_delack_enabled, 0,
     "Delay ACK to try and piggyback it onto a data packet");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, drop_synfin,
     CTLFLAG_RW, drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
     tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
     tcp_do_rfc3390, 0,
     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, enable,
     CTLFLAG_RW, tcp_do_ecn, 0, "TCP ECN support");
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, maxretries,
     CTLFLAG_RW, tcp_ecn_maxretries, 0, "Max retries before giving up on ECN");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, insecure_rst,
     CTLFLAG_RW, tcp_insecure_rst, 0,
     "Follow the old (insecure) criteria for accepting RST packets");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_auto,
     CTLFLAG_RW, tcp_do_autorcvbuf, 0,
     "Enable automatic receive buffer sizing");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_inc,
     CTLFLAG_RW, tcp_autorcvbuf_inc, 0,
     "Incrementor step size of automatic receive buffer");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_max,
     CTLFLAG_RW, tcp_autorcvbuf_max, 0,
     "Max size of automatic receive buffer");
 
 /*
  * Read-locking strategy switch for tcp_input().  When zero, tcp_input()
  * takes the tcbinfo write lock up front for every segment instead of
  * trying a read lock first.
  */
 int	tcp_read_locking = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW,
     &tcp_read_locking, 0, "Enable read locking strategy");
 
 /* Segments for which tcp_input() initially took the tcbinfo read lock. */
 int	tcp_rlock_atfirst;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rlock_atfirst, CTLFLAG_RD,
     &tcp_rlock_atfirst, 0, "");
 
 /* Segments for which tcp_input() initially took the tcbinfo write lock. */
 int	tcp_wlock_atfirst;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_wlock_atfirst, CTLFLAG_RD,
     &tcp_wlock_atfirst, 0, "");
 
 /* Read locks that rw_try_upgrade() converted to a write lock in place. */
 int	tcp_wlock_upgraded;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_upgraded, CTLFLAG_RD,
     &tcp_wlock_upgraded, 0, "");
 
 /*
  * Failed upgrades where tcp_input() dropped its locks, re-acquired the
  * write lock, and found the inpcb still usable.
  */
 int	tcp_wlock_relocked;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_relocked, CTLFLAG_RD,
     &tcp_wlock_relocked, 0, "");
 
 /*
  * Failed upgrades where in_pcbrele() reported a non-zero result after
  * relocking, so tcp_input() went back to findpcb to look the inpcb up
  * again.
  */
 int	tcp_wlock_looped;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_looped, CTLFLAG_RD,
     &tcp_wlock_looped, 0, "");
 
 #ifdef VIMAGE_GLOBALS
 struct inpcbhead tcb;
 struct inpcbinfo tcbinfo;
 #endif
 #define	tcb6	tcb  /* for KAME src sync over BSD*'s */
 
 static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
 static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
 		     struct socket *, struct tcpcb *, int, int, uint8_t,
 		     int);
 static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 static void	 tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
 static void	 tcp_xmit_timer(struct tcpcb *, int);
 static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
 static void inline
 		 tcp_congestion_exp(struct tcpcb *);
 
 /*
  * React to a congestion indication on tp: set the slow start threshold
  * to half of min(send window, congestion window), rounded down to whole
  * segments and never below two segments, enter fast recovery, remember
  * snd_max as the recovery point, and, if ECN was permitted on this
  * connection, flag that a CWR has to be sent.
  */
 static void inline
 tcp_congestion_exp(struct tcpcb *tp)
 {
 	u_int segs;
 
 	/* Half of the usable window, expressed in whole segments. */
 	segs = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
 	tp->snd_ssthresh = (segs < 2 ? 2 : segs) * tp->t_maxseg;
 
 	ENTER_FASTRECOVERY(tp);
 	tp->snd_recover = tp->snd_max;
 
 	if ((tp->t_flags & TF_ECN_PERMIT) != 0)
 		tp->t_flags |= TF_ECN_SND_CWR;
 }
 
 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
 #ifdef INET6
 #define ND6_HINT(tp) \
 do { \
 	if ((tp) && (tp)->t_inpcb && \
 	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
 		nd6_nud_hint(NULL, NULL, 0); \
 } while (0)
 #else
 #define ND6_HINT(tp)
 #endif
 
 /*
  * Indicate whether this ack should be delayed.  We can delay the ack if
  *	- there is no delayed ack timer in progress and
  *	- our last ack wasn't a 0-sized window.  We never want to delay
  *	  the ack that opens up a 0-sized window and
  *		- delayed acks are enabled or
  *		- this is a half-synchronized T/TCP connection.
  *
  * The argument is parenthesized at every use so that any pointer-valued
  * expression (e.g. "p + 1" or "&x") can be passed safely.  Note that it
  * is evaluated more than once, so it must not have side effects.
  */
 #define DELAY_ACK(tp)							\
 	((!tcp_timer_active((tp), TT_DELACK) &&				\
 	    ((tp)->t_flags & TF_RXWIN0SENT) == 0) &&			\
 	    (V_tcp_delack_enabled || ((tp)->t_flags & TF_NEEDSYN)))
 
 /*
  * TCP input handling is split into multiple parts:
  *   tcp6_input is a thin wrapper around tcp_input for the extended
  *	ip6_protox[] call format in ip6_input
  *   tcp_input handles primary segment validation, inpcb lookup and
  *	SYN processing on listen sockets
  *   tcp_do_segment processes the ACK and text of the segment for
  *	establishing, established and closing connections
  */
 #ifdef INET6
 int
 tcp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	INIT_VNET_INET6(curvnet);
 	struct mbuf *m = *mp;
 	struct in6_ifaddr *ia6;
 
 	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
 
 	/*
 	 * draft-itojun-ipv6-tcp-to-anycast
 	 * better place to put this in?
 	 */
 	ia6 = ip6_getdstifaddr(m);
 	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
 		struct ip6_hdr *ip6;
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
 			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
 		return IPPROTO_DONE;
 	}
 
 	tcp_input(m, *offp);
 	return IPPROTO_DONE;
 }
 #endif
 
 /*
  * Primary TCP input routine: validate an incoming segment, find the inpcb
  * it belongs to and dispatch it.
  *
  *   m     mbuf chain starting at the IP (or IPv6) header of the segment.
  *   off0  offset of the TCP header from the start of the IP header.
  *
  * Processing order as implemented below:
  *   1. Verify the TCP checksum and the header/option lengths, and convert
  *      the seq/ack/win/urp header fields to host byte order.
  *   2. Acquire the tcbinfo lock (read lock when the flags allow it and
  *      tcp_read_locking is set, write lock otherwise) and look up the
  *      inpcb.
  *   3. Connections in TIMEWAIT are handed to tcp_twcheck(); listen
  *      sockets go through the syncache (syncache_expand() for a pure ACK,
  *      syncache_add() for a valid SYN); everything else is passed to
  *      tcp_do_segment().
  *
  * ti_locked records which tcbinfo lock mode is currently held so that
  * the exit labels can release the matching lock.
  */
 void
 tcp_input(struct mbuf *m, int off0)
 {
 	INIT_VNET_INET(curvnet);
 #ifdef INET6
 	INIT_VNET_INET6(curvnet);
 #endif
 #ifdef IPSEC
 	INIT_VNET_IPSEC(curvnet);
 #endif
 	struct tcphdr *th;
 	struct ip *ip = NULL;
 	struct ipovly *ipov;
 	struct inpcb *inp = NULL;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 	u_char *optp = NULL;
 	int optlen = 0;
 	int len, tlen, off;
 	int drop_hdrlen;
 	int thflags;
 	int rstreason = 0;	/* For badport_bandlim accounting purposes */
 	uint8_t iptos;
 #ifdef IPFIREWALL_FORWARD
 	struct m_tag *fwd_tag;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 #else
 	const void *ip6 = NULL;
 	const int isipv6 = 0;
 #endif
 	struct tcpopt to;		/* options in this segment */
 	char *s = NULL;			/* address and port logging */
 	/* Which tcbinfo lock mode is held right now; one of TI_* below. */
 	int ti_locked;
 #define	TI_UNLOCKED	1
 #define	TI_RLOCKED	2
 #define	TI_WLOCKED	3
 
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 #endif
 
 #ifdef INET6
 	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
 #endif
 
 	to.to_flags = 0;
 	V_tcpstat.tcps_rcvtotal++;
 
 	if (isipv6) {
 #ifdef INET6
 		/* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */
 		ip6 = mtod(m, struct ip6_hdr *);
 		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
 		if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
 			V_tcpstat.tcps_rcvbadsum++;
 			goto drop;
 		}
 		th = (struct tcphdr *)((caddr_t)ip6 + off0);
 
 		/*
 		 * Be proactive about unspecified IPv6 address in source.
 		 * As we use all-zero to indicate unbounded/unconnected pcb,
 		 * unspecified IPv6 address can be used to confuse us.
 		 *
 		 * Note that packets with unspecified IPv6 destination is
 		 * already dropped in ip6_input.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
 			/* XXX stat */
 			goto drop;
 		}
 #else
 		th = NULL;		/* XXX: Avoid compiler warning. */
 #endif
 	} else {
 		/*
 		 * Get IP and TCP header together in first mbuf.
 		 * Note: IP leaves IP header in first mbuf.
 		 */
 		if (off0 > sizeof (struct ip)) {
 			ip_stripoptions(m, (struct mbuf *)0);
 			off0 = sizeof(struct ip);
 		}
 		if (m->m_len < sizeof (struct tcpiphdr)) {
 			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
 			    == NULL) {
 				V_tcpstat.tcps_rcvshort++;
 				return;
 			}
 		}
 		ip = mtod(m, struct ip *);
 		ipov = (struct ipovly *)ip;
 		th = (struct tcphdr *)((caddr_t)ip + off0);
 		tlen = ip->ip_len;
 
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
 			else
 				th->th_sum = in_pseudo(ip->ip_src.s_addr,
 						ip->ip_dst.s_addr,
 						htonl(m->m_pkthdr.csum_data +
 							ip->ip_len +
 							IPPROTO_TCP));
 			th->th_sum ^= 0xffff;
 #ifdef TCPDEBUG
 			ipov->ih_len = (u_short)tlen;
 			ipov->ih_len = htons(ipov->ih_len);
 #endif
 		} else {
 			/*
 			 * Checksum extended TCP header and data.
 			 */
 			len = sizeof (struct ip) + tlen;
 			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
 			ipov->ih_len = (u_short)tlen;
 			ipov->ih_len = htons(ipov->ih_len);
 			th->th_sum = in_cksum(m, len);
 		}
 		if (th->th_sum) {
 			V_tcpstat.tcps_rcvbadsum++;
 			goto drop;
 		}
 		/* Re-initialization for later version check */
 		ip->ip_v = IPVERSION;
 	}
 
 #ifdef INET6
 	if (isipv6)
 		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 	else
 #endif
 		iptos = ip->ip_tos;
 
 	/*
 	 * Check that TCP offset makes sense,
 	 * pull out TCP options and adjust length.		XXX
 	 */
 	off = th->th_off << 2;
 	if (off < sizeof (struct tcphdr) || off > tlen) {
 		V_tcpstat.tcps_rcvbadoff++;
 		goto drop;
 	}
 	tlen -= off;	/* tlen is used instead of ti->ti_len */
 	if (off > sizeof (struct tcphdr)) {
 		if (isipv6) {
 #ifdef INET6
 			IP6_EXTHDR_CHECK(m, off0, off, );
 			ip6 = mtod(m, struct ip6_hdr *);
 			th = (struct tcphdr *)((caddr_t)ip6 + off0);
 #endif
 		} else {
 			if (m->m_len < sizeof(struct ip) + off) {
 				if ((m = m_pullup(m, sizeof (struct ip) + off))
 				    == NULL) {
 					V_tcpstat.tcps_rcvshort++;
 					return;
 				}
 				ip = mtod(m, struct ip *);
 				ipov = (struct ipovly *)ip;
 				th = (struct tcphdr *)((caddr_t)ip + off0);
 			}
 		}
 		optlen = off - sizeof (struct tcphdr);
 		optp = (u_char *)(th + 1);
 	}
 	thflags = th->th_flags;
 
 	/*
 	 * Convert TCP protocol specific fields to host format.
 	 */
 	th->th_seq = ntohl(th->th_seq);
 	th->th_ack = ntohl(th->th_ack);
 	th->th_win = ntohs(th->th_win);
 	th->th_urp = ntohs(th->th_urp);
 
 	/*
 	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
 	 */
 	drop_hdrlen = off0 + off;
 
 	/*
 	 * Locate pcb for segment, which requires a lock on tcbinfo.
 	 * Optimistically acquire a global read lock rather than a write lock
 	 * unless header flags necessarily imply a state change.  There are
 	 * two cases where we might discover later we need a write lock
 	 * despite the flags: ACKs moving a connection out of the syncache,
 	 * and ACKs for a connection in TIMEWAIT.
 	 */
 	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
 	    tcp_read_locking == 0) {
 		INP_INFO_WLOCK(&V_tcbinfo);
 		ti_locked = TI_WLOCKED;
 		tcp_wlock_atfirst++;
 	} else {
 		INP_INFO_RLOCK(&V_tcbinfo);
 		ti_locked = TI_RLOCKED;
 		tcp_rlock_atfirst++;
 	}
 
 findpcb:
 #ifdef INVARIANTS
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	else if (ti_locked == TI_WLOCKED)
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	else
 		panic("%s: findpcb ti_locked %d\n", __func__, ti_locked);
 #endif
 
 #ifdef IPFIREWALL_FORWARD
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
 	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 
 	if (fwd_tag != NULL && isipv6 == 0) {	/* IPv6 support is not yet */
 		struct sockaddr_in *next_hop;
 
 		next_hop = (struct sockaddr_in *)(fwd_tag+1);
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * already got one like this?
 		 */
 		inp = in_pcblookup_hash(&V_tcbinfo,
 					ip->ip_src, th->th_sport,
 					ip->ip_dst, th->th_dport,
 					0, m->m_pkthdr.rcvif);
 		if (!inp) {
 			/* It's new.  Try to find the ambushing socket. */
 			inp = in_pcblookup_hash(&V_tcbinfo,
 						ip->ip_src, th->th_sport,
 						next_hop->sin_addr,
 						next_hop->sin_port ?
 						    ntohs(next_hop->sin_port) :
 						    th->th_dport,
 						INPLOOKUP_WILDCARD,
 						m->m_pkthdr.rcvif);
 		}
 		/* Remove the tag from the packet.  We don't need it anymore. */
 		m_tag_delete(m, fwd_tag);
 	} else
 #endif /* IPFIREWALL_FORWARD */
 	{
 		if (isipv6) {
 #ifdef INET6
 			inp = in6_pcblookup_hash(&V_tcbinfo,
 						 &ip6->ip6_src, th->th_sport,
 						 &ip6->ip6_dst, th->th_dport,
 						 INPLOOKUP_WILDCARD,
 						 m->m_pkthdr.rcvif);
 #endif
 		} else
 			inp = in_pcblookup_hash(&V_tcbinfo,
 						ip->ip_src, th->th_sport,
 						ip->ip_dst, th->th_dport,
 						INPLOOKUP_WILDCARD,
 						m->m_pkthdr.rcvif);
 	}
 
 	/*
 	 * If the INPCB does not exist then all data in the incoming
 	 * segment is discarded and an appropriate RST is sent back.
 	 * XXX MRT Send RST using which routing table?
 	 */
 	if (inp == NULL) {
 		/*
 		 * Log communication attempts to ports that are not
 		 * in use.
 		 */
 		if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
 		    tcp_log_in_vain == 2) {
 			if ((s = tcp_log_addrs(NULL, th, (void *)ip, ip6)))
 				log(LOG_INFO, "%s; %s: Connection attempt "
 				    "to closed port\n", s, __func__);
 		}
 		/*
 		 * When blackholing do not respond with a RST but
 		 * completely ignore the segment and drop it.
 		 */
 		if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
 		    V_blackhole == 2)
 			goto dropunlock;
 
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 	INP_WLOCK(inp);
 
 #ifdef IPSEC
 #ifdef INET6
 	if (isipv6 && ipsec6_in_reject(m, inp)) {
 		V_ipsec6stat.in_polvio++;
 		goto dropunlock;
 	} else
 #endif /* INET6 */
 	if (ipsec4_in_reject(m, inp) != 0) {
 		V_ipsec4stat.in_polvio++;
 		goto dropunlock;
 	}
 #endif /* IPSEC */
 
 	/*
 	 * Check the minimum TTL for socket.
 	 */
 	if (inp->inp_ip_minttl != 0) {
 #ifdef INET6
 		if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim)
 			goto dropunlock;
 		else
 #endif
 		if (inp->inp_ip_minttl > ip->ip_ttl)
 			goto dropunlock;
 	}
 
 	/*
 	 * A previous connection in TIMEWAIT state is supposed to catch stray
 	 * or duplicate segments arriving late.  If this segment was a
 	 * legitimate new connection attempt the old INPCB gets removed and
 	 * we can try again to find a listening socket.
 	 *
 	 * At this point, due to earlier optimism, we may hold a read lock on
 	 * the inpcbinfo, rather than a write lock.  If so, we need to
 	 * upgrade, or if that fails, acquire a reference on the inpcb, drop
 	 * all locks, acquire a global write lock, and then re-acquire the
 	 * inpcb lock.  We may at that point discover that another thread has
 	 * tried to free the inpcb, in which case we need to loop back and
 	 * try to find a new inpcb to deliver to.
 	 */
 	if (inp->inp_vflag & INP_TIMEWAIT) {
 		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
 		    ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked));
 
 		if (ti_locked == TI_RLOCKED) {
 			if (rw_try_upgrade(&V_tcbinfo.ipi_lock) == 0) {
 				in_pcbref(inp);
 				INP_WUNLOCK(inp);
 				INP_INFO_RUNLOCK(&V_tcbinfo);
 				INP_INFO_WLOCK(&V_tcbinfo);
 				ti_locked = TI_WLOCKED;
 				INP_WLOCK(inp);
 				if (in_pcbrele(inp)) {
 					tcp_wlock_looped++;
 					inp = NULL;
 					goto findpcb;
 				}
 				tcp_wlock_relocked++;
 			} else {
 				ti_locked = TI_WLOCKED;
 				tcp_wlock_upgraded++;
 			}
 		}
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 		if (thflags & TH_SYN)
 			tcp_dooptions(&to, optp, optlen, TO_SYN);
 		/*
 		 * NB: tcp_twcheck unlocks the INP and frees the mbuf.
 		 */
 		if (tcp_twcheck(inp, &to, th, m, tlen))
 			goto findpcb;
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		return;
 	}
 	/*
 	 * The TCPCB may no longer exist if the connection is winding
 	 * down or it is in the CLOSED state.  Either way we drop the
 	 * segment and send an appropriate response.
 	 */
 	tp = intotcpcb(inp);
 	if (tp == NULL || tp->t_state == TCPS_CLOSED) {
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 
 	/*
 	 * We've identified a valid inpcb, but it could be that we need an
 	 * inpcbinfo write lock and have only a read lock.  In this case,
 	 * attempt to upgrade/relock using the same strategy as the TIMEWAIT
 	 * case above.
 	 */
 	if (tp->t_state != TCPS_ESTABLISHED ||
 	    (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
 	    tcp_read_locking == 0) {
 		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
 		    ("%s: upgrade check ti_locked %d", __func__, ti_locked));
 
 		if (ti_locked == TI_RLOCKED) {
 			if (rw_try_upgrade(&V_tcbinfo.ipi_lock) == 0) {
 				in_pcbref(inp);
 				INP_WUNLOCK(inp);
 				INP_INFO_RUNLOCK(&V_tcbinfo);
 				INP_INFO_WLOCK(&V_tcbinfo);
 				ti_locked = TI_WLOCKED;
 				INP_WLOCK(inp);
 				if (in_pcbrele(inp)) {
 					tcp_wlock_looped++;
 					inp = NULL;
 					goto findpcb;
 				}
 				tcp_wlock_relocked++;
 			} else {
 				ti_locked = TI_WLOCKED;
 				tcp_wlock_upgraded++;
 			}
 		}
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	}
 
 #ifdef MAC
 	INP_WLOCK_ASSERT(inp);
 	if (mac_inpcb_check_deliver(inp, m))
 		goto dropunlock;
 #endif
 	so = inp->inp_socket;
 	KASSERT(so != NULL, ("%s: so == NULL", __func__));
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG) {
 		ostate = tp->t_state;
 		if (isipv6) {
 #ifdef INET6
 			bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
 #endif
 		} else
 			bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
 		tcp_savetcp = *th;
 	}
 #endif
 	/*
 	 * When the socket is accepting connections (the INPCB is in LISTEN
 	 * state) we look into the SYN cache if this is a new connection
 	 * attempt or the completion of a previous one.
 	 */
 	if (so->so_options & SO_ACCEPTCONN) {
 		struct in_conninfo inc;
 
 		KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
 		    "tp not listening", __func__));
 
 		bzero(&inc, sizeof(inc));
-		inc.inc_isipv6 = isipv6;
 #ifdef INET6
 		if (isipv6) {
+			inc.inc_flags |= INC_ISIPV6;
 			inc.inc6_faddr = ip6->ip6_src;
 			inc.inc6_laddr = ip6->ip6_dst;
 		} else
 #endif
 		{
 			inc.inc_faddr = ip->ip_src;
 			inc.inc_laddr = ip->ip_dst;
 		}
 		inc.inc_fport = th->th_sport;
 		inc.inc_lport = th->th_dport;
 
 		/*
 		 * Check for an existing connection attempt in syncache if
 		 * the flag is only ACK.  A successful lookup creates a new
 		 * socket appended to the listen queue in SYN_RECEIVED state.
 		 */
 		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
 			/*
 			 * Parse the TCP options here because
 			 * syncookies need access to the reflected
 			 * timestamp.
 			 */
 			tcp_dooptions(&to, optp, optlen, 0);
 			/*
 			 * NB: syncache_expand() doesn't unlock
 			 * inp and tcpinfo locks.
 			 */
 			if (!syncache_expand(&inc, &to, th, &so, m)) {
 				/*
 				 * No syncache entry or ACK was not
 				 * for our SYN/ACK.  Send a RST.
 				 * NB: syncache did its own logging
 				 * of the failure cause.
 				 */
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 			}
 			if (so == NULL) {
 				/*
 				 * We completed the 3-way handshake
 				 * but could not allocate a socket
 				 * either due to memory shortage,
 				 * listen queue length limits or
 				 * global socket limits.  Send RST
 				 * or wait and have the remote end
 				 * retransmit the ACK for another
 				 * try.
 				 */
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 					log(LOG_DEBUG, "%s; %s: Listen socket: "
 					    "Socket allocation failed due to "
 					    "limits or memory shortage, %s\n",
 					    s, __func__,
 					    V_tcp_sc_rst_sock_fail ?
 					    "sending RST" : "try again");
 				if (V_tcp_sc_rst_sock_fail) {
 					rstreason = BANDLIM_UNLIMITED;
 					goto dropwithreset;
 				} else
 					goto dropunlock;
 			}
 			/*
 			 * Socket is created in state SYN_RECEIVED.
 			 * Unlock the listen socket, lock the newly
 			 * created socket and update the tp variable.
 			 */
 			INP_WUNLOCK(inp);	/* listen socket */
 			inp = sotoinpcb(so);
 			INP_WLOCK(inp);		/* new connection */
 			tp = intotcpcb(inp);
 			KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
 			    ("%s: ", __func__));
 			/*
 			 * Process the segment and the data it
 			 * contains.  tcp_do_segment() consumes
 			 * the mbuf chain and unlocks the inpcb.
 			 */
 			tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
 			    iptos, ti_locked);
 			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 			return;
 		}
 		/*
 		 * Segment flag validation for new connection attempts:
 		 *
 		 * Our (SYN|ACK) response was rejected.
 		 * Check with syncache and remove entry to prevent
 		 * retransmits.
 		 *
 		 * NB: syncache_chkrst does its own logging of failure
 		 * causes.
 		 */
 		if (thflags & TH_RST) {
 			syncache_chkrst(&inc, th);
 			goto dropunlock;
 		}
 		/*
 		 * We can't do anything without SYN.
 		 */
 		if ((thflags & TH_SYN) == 0) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN is missing, segment ignored\n",
 				    s, __func__);
 			V_tcpstat.tcps_badsyn++;
 			goto dropunlock;
 		}
 		/*
 		 * (SYN|ACK) is bogus on a listen socket.
 		 */
 		if (thflags & TH_ACK) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|ACK invalid, segment rejected\n",
 				    s, __func__);
 			syncache_badack(&inc);	/* XXX: Not needed! */
 			V_tcpstat.tcps_badsyn++;
 			rstreason = BANDLIM_RST_OPENPORT;
 			goto dropwithreset;
 		}
 		/*
 		 * If the drop_synfin option is enabled, drop all
 		 * segments with both the SYN and FIN bits set.
 		 * This prevents e.g. nmap from identifying the
 		 * TCP/IP stack.
 		 * XXX: Poor reasoning.  nmap has other methods
 		 * and is constantly refining its stack detection
 		 * strategies.
 		 * XXX: This is a violation of the TCP specification
 		 * and was used by RFC1644.
 		 */
 		if ((thflags & TH_FIN) && V_drop_synfin) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|FIN segment ignored (based on "
 				    "sysctl setting)\n", s, __func__);
 			V_tcpstat.tcps_badsyn++;
                 	goto dropunlock;
 		}
 		/*
 		 * Segment's flags are (SYN) or (SYN|FIN).
 		 *
 		 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
 		 * as they do not affect the state of the TCP FSM.
 		 * The data pointed to by TH_URG and th_urp is ignored.
 		 */
 		KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
 		    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
 		KASSERT(thflags & (TH_SYN),
 		    ("%s: Listen socket: TH_SYN not set", __func__));
 #ifdef INET6
 		/*
 		 * If deprecated address is forbidden,
 		 * we do not accept SYN to deprecated interface
 		 * address to prevent any new inbound connection from
 		 * getting established.
 		 * When we do not accept SYN, we send a TCP RST,
 		 * with deprecated source address (instead of dropping
 		 * it).  We compromise it as it is much better for peer
 		 * to send a RST, and RST will be the final packet
 		 * for the exchange.
 		 *
 		 * If we do not forbid deprecated addresses, we accept
 		 * the SYN packet.  RFC2462 does not suggest dropping
 		 * SYN in this case.
 		 * If we decipher RFC2462 5.5.4, it says like this:
 		 * 1. use of deprecated addr with existing
 		 *    communication is okay - "SHOULD continue to be
 		 *    used"
 		 * 2. use of it with new communication:
 		 *   (2a) "SHOULD NOT be used if alternate address
 		 *        with sufficient scope is available"
 		 *   (2b) nothing mentioned otherwise.
 		 * Here we fall into (2b) case as we have no choice in
 		 * our source address selection - we must obey the peer.
 		 *
 		 * The wording in RFC2462 is confusing, and there are
 		 * multiple description text for deprecated address
 		 * handling - worse, they are not exactly the same.
 		 * I believe 5.5.4 is the best one, so we follow 5.5.4.
 		 */
 		if (isipv6 && !V_ip6_use_deprecated) {
 			struct in6_ifaddr *ia6;
 
 			if ((ia6 = ip6_getdstifaddr(m)) &&
 			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt to deprecated "
 					"IPv6 address rejected\n",
 					s, __func__);
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 			}
 		}
 #endif
 		/*
 		 * Basic sanity checks on incoming SYN requests:
 		 *   Don't respond if the destination is a link layer
 		 *	broadcast according to RFC1122 4.2.3.10, p. 104.
 		 *   If it is from this socket it must be forged.
 		 *   Don't respond if the source or destination is a
 		 *	global or subnet broad- or multicast address.
 		 *   Note that it is quite possible to receive unicast
 		 *	link-layer packets with a broadcast IP address. Use
 		 *	in_broadcast() to find them.
 		 */
 		if (m->m_flags & (M_BCAST|M_MCAST)) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 			    log(LOG_DEBUG, "%s; %s: Listen socket: "
 				"Connection attempt from broad- or multicast "
 				"link layer address ignored\n", s, __func__);
 			goto dropunlock;
 		}
 		if (isipv6) {
 #ifdef INET6
 			if (th->th_dport == th->th_sport &&
 			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt to/from self "
 					"ignored\n", s, __func__);
 				goto dropunlock;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to multicast "
 					"address ignored\n", s, __func__);
 				goto dropunlock;
 			}
 #endif
 		} else {
 			if (th->th_dport == th->th_sport &&
 			    ip->ip_dst.s_addr == ip->ip_src.s_addr) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to self "
 					"ignored\n", s, __func__);
 				goto dropunlock;
 			}
 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 			    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to broad- "
 					"or multicast address ignored\n",
 					s, __func__);
 				goto dropunlock;
 			}
 		}
 		/*
 		 * SYN appears to be valid.  Create compressed TCP state
 		 * for syncache.
 		 */
 #ifdef TCPDEBUG
 		if (so->so_options & SO_DEBUG)
 			tcp_trace(TA_INPUT, ostate, tp,
 			    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 		tcp_dooptions(&to, optp, optlen, TO_SYN);
 		syncache_add(&inc, &to, th, inp, &so, m);
 		/*
 		 * Entry added to syncache and mbuf consumed.
 		 * Everything already unlocked by syncache_add().
 		 */
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 		return;
 	}
 
 	/*
 	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
 	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
 	 * the inpcb, and unlocks pcbinfo.
 	 */
 	tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	return;
 
 	/*
 	 * Exit path: release whichever tcbinfo lock is held, answer the
 	 * segment with a RST (rate-limit class in rstreason), drop the inpcb
 	 * lock if we have one, then fall into the common cleanup at drop.
 	 */
 dropwithreset:
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	else if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	else
 		panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
 	ti_locked = TI_UNLOCKED;
 
 	if (inp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(inp);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 	m = NULL;	/* mbuf chain got consumed. */
 	goto drop;
 
 	/*
 	 * Exit path: silently discard the segment after releasing the
 	 * tcbinfo lock and, if held, the inpcb lock.
 	 */
 dropunlock:
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	else if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	else
 		panic("%s: dropunlock ti_locked %d", __func__, ti_locked);
 	ti_locked = TI_UNLOCKED;
 
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 
 	/*
 	 * Common cleanup, entered with no tcbinfo lock held: free the log
 	 * string, if one was allocated, and any mbuf chain we still own.
 	 */
 drop:
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	if (m != NULL)
 		m_freem(m);
 }
 
 static void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
     int ti_locked)
 {
 	INIT_VNET_INET(tp->t_vnet);
 	int thflags, acked, ourfinisacked, needoutput = 0;
 	int rstreason, todrop, win;
 	u_long tiwin;
 	struct tcpopt to;
 
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 #endif
 	thflags = th->th_flags;
 
 	/*
 	 * If this is either a state-changing packet or current state isn't
 	 * established, we require a write lock on tcbinfo.  Otherwise, we
 	 * allow either a read lock or a write lock, as we may have acquired
 	 * a write lock due to a race.
 	 *
 	 * Require a global write lock for SYN/FIN/RST segments or
 	 * non-established connections; otherwise accept either a read or
 	 * write lock, as we may have conservatively acquired a write lock in
 	 * certain cases in tcp_input() (is this still true?).  Currently we
 	 * will never enter with no lock, so we try to drop it quickly in the
 	 * common pure ack/pure data cases.
 	 */
 	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
 	    tp->t_state != TCPS_ESTABLISHED) {
 		KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
 		    "SYN/FIN/RST/!EST", __func__, ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	} else {
 #ifdef INVARIANTS
 		if (ti_locked == TI_RLOCKED)
 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		else if (ti_locked == TI_WLOCKED)
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 		else
 			panic("%s: ti_locked %d for EST", __func__,
 			    ti_locked);
 #endif
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
 	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
 	    __func__));
 
 	/*
 	 * Segment received on connection.
 	 * Reset idle time and keep-alive timer.
 	 * XXX: This should be done after segment
 	 * validation to ignore broken/spoofed segs.
 	 */
 	tp->t_rcvtime = ticks;
 	if (TCPS_HAVEESTABLISHED(tp->t_state))
 		tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
 
 	/*
 	 * Unscale the window into a 32-bit value.
 	 * For the SYN_SENT state the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 
 	/*
 	 * TCP ECN processing.
 	 */
 	if (tp->t_flags & TF_ECN_PERMIT) {
 		switch (iptos & IPTOS_ECN_MASK) {
 		case IPTOS_ECN_CE:
 			tp->t_flags |= TF_ECN_SND_ECE;
 			V_tcpstat.tcps_ecn_ce++;
 			break;
 		case IPTOS_ECN_ECT0:
 			V_tcpstat.tcps_ecn_ect0++;
 			break;
 		case IPTOS_ECN_ECT1:
 			V_tcpstat.tcps_ecn_ect1++;
 			break;
 		}
 
 		if (thflags & TH_CWR)
 			tp->t_flags &= ~TF_ECN_SND_ECE;
 
 		/*
 		 * Congestion experienced.
 		 * Ignore if we are already trying to recover.
 		 */
 		if ((thflags & TH_ECE) &&
 		    SEQ_LEQ(th->th_ack, tp->snd_recover)) {
 			V_tcpstat.tcps_ecn_rcwnd++;
 			tcp_congestion_exp(tp);
 		}
 	}
 
 	/*
 	 * Parse options on any incoming segment.
 	 */
 	tcp_dooptions(&to, (u_char *)(th + 1),
 	    (th->th_off << 2) - sizeof(struct tcphdr),
 	    (thflags & TH_SYN) ? TO_SYN : 0);
 
 	/*
 	 * If echoed timestamp is later than the current time,
 	 * fall back to non RFC1323 RTT calculation.  Normalize
 	 * timestamp if syncookies were used when this connection
 	 * was established.
 	 */
 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 		to.to_tsecr -= tp->ts_offset;
 		if (TSTMP_GT(to.to_tsecr, ticks))
 			to.to_tsecr = 0;
 	}
 
 	/*
 	 * Process options only when we get SYN/ACK back. The SYN case
 	 * for incoming connections is handled in tcp_syncache.
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.
 	 * XXX this is traditional behavior, may need to be cleaned up.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 		if ((to.to_flags & TOF_SCALE) &&
 		    (tp->t_flags & TF_REQ_SCALE)) {
 			tp->t_flags |= TF_RCVD_SCALE;
 			tp->snd_scale = to.to_wscale;
 		}
 		/*
 		 * Initial send window.  It will be updated with
 		 * the next incoming segment to the scaled value.
 		 */
 		tp->snd_wnd = th->th_win;
 		if (to.to_flags & TOF_TS) {
 			tp->t_flags |= TF_RCVD_TSTMP;
 			tp->ts_recent = to.to_tsval;
 			tp->ts_recent_age = ticks;
 		}
 		if (to.to_flags & TOF_MSS)
 			tcp_mss(tp, to.to_mss);
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (to.to_flags & TOF_SACKPERM) == 0)
 			tp->t_flags &= ~TF_SACK_PERMIT;
 	}
 
 	/*
 	 * Header prediction: check for the two common cases
 	 * of a uni-directional data xfer.  If the packet has
 	 * no control flags, is in-sequence, the window didn't
 	 * change and we're not retransmitting, it's a
 	 * candidate.  If the length is zero and the ack moved
 	 * forward, we're the sender side of the xfer.  Just
 	 * free the data acked & wake any higher level process
 	 * that was blocked waiting for space.  If the length
 	 * is non-zero and the ack didn't move, we're the
 	 * receiver side.  If we're getting packets in-order
 	 * (the reassembly queue is empty), add the data to
 	 * the socket buffer and note that we need a delayed ack.
 	 * Make sure that the hidden state-flags are also off.
 	 * Since we check for TCPS_ESTABLISHED first, it can only
 	 * be TH_NEEDSYN.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    th->th_seq == tp->rcv_nxt &&
 	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
 	    tp->snd_nxt == tp->snd_max &&
 	    tiwin && tiwin == tp->snd_wnd && 
 	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
 	    LIST_EMPTY(&tp->t_segq) &&
 	    ((to.to_flags & TOF_TS) == 0 ||
 	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 
 		/*
 		 * If last ACK falls within this segment's sequence numbers,
 		 * record the timestamp.
 		 * NOTE that the test is modified according to the latest
 		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
 		 */
 		if ((to.to_flags & TOF_TS) != 0 &&
 		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 			tp->ts_recent_age = ticks;
 			tp->ts_recent = to.to_tsval;
 		}
 
 		if (tlen == 0) {
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
 			    tp->snd_cwnd >= tp->snd_wnd &&
 			    ((!V_tcp_do_newreno &&
 			      !(tp->t_flags & TF_SACK_PERMIT) &&
 			      tp->t_dupacks < tcprexmtthresh) ||
 			     ((V_tcp_do_newreno ||
 			       (tp->t_flags & TF_SACK_PERMIT)) &&
 			      !IN_FASTRECOVERY(tp) &&
 			      (to.to_flags & TOF_SACK) == 0 &&
 			      TAILQ_EMPTY(&tp->snd_holes)))) {
 				/*
 				 * This is a pure ack for outstanding data.
 				 */
 				if (ti_locked == TI_RLOCKED)
 					INP_INFO_RUNLOCK(&V_tcbinfo);
 				else if (ti_locked == TI_WLOCKED)
 					INP_INFO_WUNLOCK(&V_tcbinfo);
 				else
 					panic("%s: ti_locked %d on pure ACK",
 					    __func__, ti_locked);
 				ti_locked = TI_UNLOCKED;
 
 				++V_tcpstat.tcps_predack;
 
 				/*
 				 * "bad retransmit" recovery.
 				 */
 				if (tp->t_rxtshift == 1 &&
 				    ticks < tp->t_badrxtwin) {
 					++V_tcpstat.tcps_sndrexmitbad;
 					tp->snd_cwnd = tp->snd_cwnd_prev;
 					tp->snd_ssthresh =
 					    tp->snd_ssthresh_prev;
 					tp->snd_recover = tp->snd_recover_prev;
 					if (tp->t_flags & TF_WASFRECOVERY)
 					    ENTER_FASTRECOVERY(tp);
 					tp->snd_nxt = tp->snd_max;
 					tp->t_badrxtwin = 0;
 				}
 
 				/*
 				 * Recalculate the transmit timer / rtt.
 				 *
 				 * Some boxes send broken timestamp replies
 				 * during the SYN+ACK phase, ignore
 				 * timestamps of 0 or we could calculate a
 				 * huge RTT and blow up the retransmit timer.
 				 */
 				if ((to.to_flags & TOF_TS) != 0 &&
 				    to.to_tsecr) {
 					if (!tp->t_rttlow ||
 					    tp->t_rttlow > ticks - to.to_tsecr)
 						tp->t_rttlow = ticks - to.to_tsecr;
 					tcp_xmit_timer(tp,
 					    ticks - to.to_tsecr + 1);
 				} else if (tp->t_rtttime &&
 				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
 					if (!tp->t_rttlow ||
 					    tp->t_rttlow > ticks - tp->t_rtttime)
 						tp->t_rttlow = ticks - tp->t_rtttime;
 					tcp_xmit_timer(tp,
 							ticks - tp->t_rtttime);
 				}
 				tcp_xmit_bandwidth_limit(tp, th->th_ack);
 				acked = th->th_ack - tp->snd_una;
 				V_tcpstat.tcps_rcvackpack++;
 				V_tcpstat.tcps_rcvackbyte += acked;
 				sbdrop(&so->so_snd, acked);
 				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
 				    SEQ_LEQ(th->th_ack, tp->snd_recover))
 					tp->snd_recover = th->th_ack - 1;
 				tp->snd_una = th->th_ack;
 				/*
 				 * Pull snd_wl2 up to prevent seq wrap relative
 				 * to th_ack.
 				 */
 				tp->snd_wl2 = th->th_ack;
 				tp->t_dupacks = 0;
 				m_freem(m);
 				ND6_HINT(tp); /* Some progress has been made. */
 
 				/*
 				 * If all outstanding data are acked, stop
 				 * retransmit timer, otherwise restart timer
 				 * using current (possibly backed-off) value.
 				 * If process is waiting for space,
 				 * wakeup/selwakeup/signal.  If data
 				 * are ready to send, let tcp_output
 				 * decide between more output or persist.
 				 */
 #ifdef TCPDEBUG
 				if (so->so_options & SO_DEBUG)
 					tcp_trace(TA_INPUT, ostate, tp,
 					    (void *)tcp_saveipgen,
 					    &tcp_savetcp, 0);
 #endif
 				if (tp->snd_una == tp->snd_max)
 					tcp_timer_activate(tp, TT_REXMT, 0);
 				else if (!tcp_timer_active(tp, TT_PERSIST))
 					tcp_timer_activate(tp, TT_REXMT,
 						      tp->t_rxtcur);
 				sowwakeup(so);
 				if (so->so_snd.sb_cc)
 					(void) tcp_output(tp);
 				goto check_delack;
 			}
 		} else if (th->th_ack == tp->snd_una &&
 		    tlen <= sbspace(&so->so_rcv)) {
 			int newsize = 0;	/* automatic sockbuf scaling */
 
 			/*
 			 * This is a pure, in-sequence data packet with
 			 * nothing on the reassembly queue and we have enough
 			 * buffer space to take it.
 			 */
 			if (ti_locked == TI_RLOCKED)
 				INP_INFO_RUNLOCK(&V_tcbinfo);
 			else if (ti_locked == TI_WLOCKED)
 				INP_INFO_WUNLOCK(&V_tcbinfo);
 			else
 				panic("%s: ti_locked %d on pure data "
 				    "segment", __func__, ti_locked);
 			ti_locked = TI_UNLOCKED;
 
 			/* Clean receiver SACK report if present */
 			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
 				tcp_clean_sackreport(tp);
 			++V_tcpstat.tcps_preddat;
 			tp->rcv_nxt += tlen;
 			/*
 			 * Pull snd_wl1 up to prevent seq wrap relative to
 			 * th_seq.
 			 */
 			tp->snd_wl1 = th->th_seq;
 			/*
 			 * Pull rcv_up up to prevent seq wrap relative to
 			 * rcv_nxt.
 			 */
 			tp->rcv_up = tp->rcv_nxt;
 			V_tcpstat.tcps_rcvpack++;
 			V_tcpstat.tcps_rcvbyte += tlen;
 			ND6_HINT(tp);	/* Some progress has been made */
 #ifdef TCPDEBUG
 			if (so->so_options & SO_DEBUG)
 				tcp_trace(TA_INPUT, ostate, tp,
 				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 		/*
 		 * Automatic sizing of receive socket buffer.  Often the send
 		 * buffer size is not optimally adjusted to the actual network
 		 * conditions at hand (delay bandwidth product).  Setting the
 		 * buffer size too small limits throughput on links with high
 		 * bandwidth and high delay (eg. trans-continental/oceanic links).
 		 *
 		 * On the receive side the socket buffer memory is only rarely
 		 * used to any significant extent.  This allows us to be much
 		 * more aggressive in scaling the receive socket buffer.  For
 		 * the case that the buffer space is actually used to a large
 		 * extent and we run out of kernel memory we can simply drop
 		 * the new segments; TCP on the sender will just retransmit it
 		 * later.  Setting the buffer size too big may only consume too
 		 * much kernel memory if the application doesn't read() from
 		 * the socket or packet loss or reordering makes use of the
 		 * reassembly queue.
 		 *
 		 * The criteria to step up the receive buffer one notch are:
 		 *  1. the number of bytes received during the time it takes
 		 *     one timestamp to be reflected back to us (the RTT);
 		 *  2. received bytes per RTT is within seven eighth of the
 		 *     current socket buffer size;
 		 *  3. receive buffer size has not hit maximal automatic size;
 		 *
 		 * This algorithm does one step per RTT at most and only if
 		 * we receive a bulk stream w/o packet losses or reorderings.
 		 * Shrinking the buffer during idle times is not necessary as
 		 * it doesn't consume any memory when idle.
 		 *
 		 * TODO: Only step up if the application is actually serving
 		 * the buffer to better manage the socket buffer resources.
 		 */
 			if (V_tcp_do_autorcvbuf &&
 			    to.to_tsecr &&
 			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
 				if (to.to_tsecr > tp->rfbuf_ts &&
 				    to.to_tsecr - tp->rfbuf_ts < hz) {
 					if (tp->rfbuf_cnt >
 					    (so->so_rcv.sb_hiwat / 8 * 7) &&
 					    so->so_rcv.sb_hiwat <
 					    V_tcp_autorcvbuf_max) {
 						newsize =
 						    min(so->so_rcv.sb_hiwat +
 						    V_tcp_autorcvbuf_inc,
 						    V_tcp_autorcvbuf_max);
 					}
 					/* Start over with next RTT. */
 					tp->rfbuf_ts = 0;
 					tp->rfbuf_cnt = 0;
 				} else
 					tp->rfbuf_cnt += tlen;	/* add up */
 			}
 
 			/* Add data to socket buffer. */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				m_freem(m);
 			} else {
 				/*
 				 * Set new socket buffer size.
 				 * Give up when limit is reached.
 				 */
 				if (newsize)
 					if (!sbreserve_locked(&so->so_rcv,
 					    newsize, so, NULL))
 						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 				m_adj(m, drop_hdrlen);	/* delayed header drop */
 				sbappendstream_locked(&so->so_rcv, m);
 			}
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 			if (DELAY_ACK(tp)) {
 				tp->t_flags |= TF_DELACK;
 			} else {
 				tp->t_flags |= TF_ACKNOW;
 				tcp_output(tp);
 			}
 			goto check_delack;
 		}
 	}
 
 	/*
 	 * Calculate amount of space in receive window,
 	 * and then do TCP input processing.
 	 * Receive window is amount of space in rcv queue,
 	 * but not less than advertised window.
 	 */
 	win = sbspace(&so->so_rcv);
 	if (win < 0)
 		win = 0;
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
 	/* Reset receive buffer auto scaling when not in bulk receive mode. */
 	tp->rfbuf_ts = 0;
 	tp->rfbuf_cnt = 0;
 
 	switch (tp->t_state) {
 
 	/*
 	 * If the state is SYN_RECEIVED:
 	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
 	 */
 	case TCPS_SYN_RECEIVED:
 		if ((thflags & TH_ACK) &&
 		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 		     SEQ_GT(th->th_ack, tp->snd_max))) {
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 		}
 		break;
 
 	/*
 	 * If the state is SYN_SENT:
 	 *	if seg contains an ACK, but not for our SYN, drop the input.
 	 *	if seg contains a RST, then drop the connection.
 	 *	if seg does not contain SYN, then drop it.
 	 * Otherwise this is an acceptable SYN segment
 	 *	initialize tp->rcv_nxt and tp->irs
 	 *	if seg contains ack then advance tp->snd_una
 	 *	if seg contains an ECE and ECN support is enabled, the stream
 	 *	    is ECN capable.
 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 	 *	arrange for segment to be acked (eventually)
 	 *	continue processing rest of data/controls, beginning with URG
 	 */
 	case TCPS_SYN_SENT:
 		if ((thflags & TH_ACK) &&
 		    (SEQ_LEQ(th->th_ack, tp->iss) ||
 		     SEQ_GT(th->th_ack, tp->snd_max))) {
 			rstreason = BANDLIM_UNLIMITED;
 			goto dropwithreset;
 		}
 		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST))
 			tp = tcp_drop(tp, ECONNREFUSED);
 		if (thflags & TH_RST)
 			goto drop;
 		if (!(thflags & TH_SYN))
 			goto drop;
 
 		tp->irs = th->th_seq;
 		tcp_rcvseqinit(tp);
 		if (thflags & TH_ACK) {
 			V_tcpstat.tcps_connects++;
 			soisconnected(so);
 #ifdef MAC
 			SOCK_LOCK(so);
 			mac_socketpeer_set_from_mbuf(m, so);
 			SOCK_UNLOCK(so);
 #endif
 			/* Do window scaling on this connection? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 				tp->rcv_scale = tp->request_r_scale;
 			}
 			tp->rcv_adv += tp->rcv_wnd;
 			tp->snd_una++;		/* SYN is acked */
 			/*
 			 * If there's data, delay ACK; if there's also a FIN
 			 * ACKNOW will be turned on later.
 			 */
 			if (DELAY_ACK(tp) && tlen != 0)
 				tcp_timer_activate(tp, TT_DELACK,
 				    tcp_delacktime);
 			else
 				tp->t_flags |= TF_ACKNOW;
 
 			if ((thflags & TH_ECE) && V_tcp_do_ecn) {
 				tp->t_flags |= TF_ECN_PERMIT;
 				V_tcpstat.tcps_ecn_shs++;
 			}
 			
 			/*
 			 * Received <SYN,ACK> in SYN_SENT[*] state.
 			 * Transitions:
 			 *	SYN_SENT  --> ESTABLISHED
 			 *	SYN_SENT* --> FIN_WAIT_1
 			 */
 			tp->t_starttime = ticks;
 			if (tp->t_flags & TF_NEEDFIN) {
 				tp->t_state = TCPS_FIN_WAIT_1;
 				tp->t_flags &= ~TF_NEEDFIN;
 				thflags &= ~TH_SYN;
 			} else {
 				tp->t_state = TCPS_ESTABLISHED;
 				tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
 			}
 		} else {
 			/*
 			 * Received initial SYN in SYN-SENT[*] state =>
 			 * simultaneous open.  If segment contains CC option
 			 * and there is a cached CC, apply TAO test.
 			 * If it succeeds, connection is half-synchronized.
 			 * Otherwise, do 3-way handshake:
 			 *        SYN-SENT -> SYN-RECEIVED
 			 *        SYN-SENT* -> SYN-RECEIVED*
 			 * If there was no CC option, clear cached CC value.
 			 */
 			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			tp->t_state = TCPS_SYN_RECEIVED;
 		}
 
 		KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
 		    "ti_locked %d", __func__, ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		/*
 		 * Advance th->th_seq to correspond to first data byte.
 		 * If data, trim to stay within window,
 		 * dropping FIN if necessary.
 		 */
 		th->th_seq++;
 		if (tlen > tp->rcv_wnd) {
 			todrop = tlen - tp->rcv_wnd;
 			m_adj(m, -todrop);
 			tlen = tp->rcv_wnd;
 			thflags &= ~TH_FIN;
 			V_tcpstat.tcps_rcvpackafterwin++;
 			V_tcpstat.tcps_rcvbyteafterwin += todrop;
 		}
 		tp->snd_wl1 = th->th_seq - 1;
 		tp->rcv_up = th->th_seq;
 		/*
 		 * Client side of transaction: already sent SYN and data.
 		 * If the remote host used T/TCP to validate the SYN,
 		 * our data will be ACK'd; if so, enter normal data segment
 		 * processing in the middle of step 5, ack processing.
 		 * Otherwise, goto step 6.
 		 */
 		if (thflags & TH_ACK)
 			goto process_ACK;
 
 		goto step6;
 
 	/*
 	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
 	 *      do normal processing.
 	 *
 	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
 	 */
 	case TCPS_LAST_ACK:
 	case TCPS_CLOSING:
 		break;  /* continue normal processing */
 	}
 
 	/*
 	 * States other than LISTEN or SYN_SENT.
 	 * First check the RST flag and sequence number since reset segments
 	 * are exempt from the timestamp and connection count tests.  This
 	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
 	 * below which allowed reset segments in half the sequence space
 	 * to fall through and be processed (which gives forged reset
 	 * segments with a random sequence number a 50 percent chance of
 	 * killing a connection).
 	 * Then check timestamp, if present.
 	 * Then check the connection count, if present.
 	 * Then check that at least some bytes of segment are within
 	 * receive window.  If segment begins before rcv_nxt,
 	 * drop leading data (and SYN); if nothing left, just ack.
 	 *
 	 *
 	 * If the RST bit is set, check the sequence number to see
 	 * if this is a valid reset segment.
 	 * RFC 793 page 37:
 	 *   In all states except SYN-SENT, all reset (RST) segments
 	 *   are validated by checking their SEQ-fields.  A reset is
 	 *   valid if its sequence number is in the window.
 	 * Note: this does not take into account delayed ACKs, so
 	 *   we should test against last_ack_sent instead of rcv_nxt.
 	 *   The sequence number in the reset segment is normally an
 	 *   echo of our outgoing acknowledgement numbers, but some hosts
 	 *   send a reset with the sequence number at the rightmost edge
 	 *   of our receive window, and we have to handle this case.
 	 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
 	 *   that brute force RST attacks are possible.  To combat this,
 	 *   we use a much stricter check while in the ESTABLISHED state,
 	 *   only accepting RSTs where the sequence number is equal to
 	 *   last_ack_sent.  In all other states (the states in which a
 	 *   RST is more likely), the more permissive check is used.
 	 * If we have multiple segments in flight, the initial reset
 	 * segment sequence numbers will be to the left of last_ack_sent,
 	 * but they will eventually catch up.
 	 * In any case, it never made sense to trim reset segments to
 	 * fit the receive window since RFC 1122 says:
 	 *   4.2.2.12  RST Segment: RFC-793 Section 3.4
 	 *
 	 *    A TCP SHOULD allow a received RST segment to include data.
 	 *
 	 *    DISCUSSION
 	 *         It has been suggested that a RST segment could contain
 	 *         ASCII text that encoded and explained the cause of the
 	 *         RST.  No standard has yet been established for such
 	 *         data.
 	 *
 	 * If the reset segment passes the sequence number test examine
 	 * the state:
 	 *    SYN_RECEIVED STATE:
 	 *	If passive open, return to LISTEN state.
 	 *	If active open, inform user that connection was refused.
 	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
 	 *	Inform user that connection was reset, and close tcb.
 	 *    CLOSING, LAST_ACK STATES:
 	 *	Close the tcb.
 	 *    TIME_WAIT STATE:
 	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
 	 *      RFC 1337.
 	 */
 	if (thflags & TH_RST) {
 		if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
 		    SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
 			switch (tp->t_state) {
 
 			case TCPS_SYN_RECEIVED:
 				so->so_error = ECONNREFUSED;
 				goto close;
 
 			case TCPS_ESTABLISHED:
 				if (V_tcp_insecure_rst == 0 &&
 				    !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
 				    SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
 				    !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
 				    SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
 					V_tcpstat.tcps_badrst++;
 					goto drop;
 				}
 				/* FALLTHROUGH */
 			case TCPS_FIN_WAIT_1:
 			case TCPS_FIN_WAIT_2:
 			case TCPS_CLOSE_WAIT:
 				so->so_error = ECONNRESET;
 			close:
 				KASSERT(ti_locked == TI_WLOCKED,
 				    ("tcp_do_segment: TH_RST 1 ti_locked %d",
 				    ti_locked));
 				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 				tp->t_state = TCPS_CLOSED;
 				V_tcpstat.tcps_drops++;
 				tp = tcp_close(tp);
 				break;
 
 			case TCPS_CLOSING:
 			case TCPS_LAST_ACK:
 				KASSERT(ti_locked == TI_WLOCKED,
 				    ("tcp_do_segment: TH_RST 2 ti_locked %d",
 				    ti_locked));
 				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 				tp = tcp_close(tp);
 				break;
 			}
 		}
 		goto drop;
 	}
 
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
 	 * and it's less than ts_recent, drop it.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
 
 		/* Check to see if ts_recent is over 24 days old.  */
 		if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
 			/*
 			 * Invalidate ts_recent.  If this segment updates
 			 * ts_recent, the age will be reset later and ts_recent
 			 * will get a valid value.  If it does not, setting
 			 * ts_recent to zero will at least satisfy the
 			 * requirement that zero be placed in the timestamp
 			 * echo reply when ts_recent isn't valid.  The
 			 * age isn't reset until we get a valid ts_recent
 			 * because we don't want out-of-order segments to be
 			 * dropped when ts_recent is old.
 			 */
 			tp->ts_recent = 0;
 		} else {
 			V_tcpstat.tcps_rcvduppack++;
 			V_tcpstat.tcps_rcvdupbyte += tlen;
 			V_tcpstat.tcps_pawsdrop++;
 			if (tlen)
 				goto dropafterack;
 			goto drop;
 		}
 	}
 
 	/*
 	 * In the SYN-RECEIVED state, validate that the packet belongs to
 	 * this connection before trimming the data to fit the receive
 	 * window.  Check the sequence number versus IRS since we know
 	 * the sequence numbers haven't wrapped.  This is a partial fix
 	 * for the "LAND" DoS attack.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
 		rstreason = BANDLIM_RST_OPENPORT;
 		goto dropwithreset;
 	}
 
 	todrop = tp->rcv_nxt - th->th_seq;
 	if (todrop > 0) {
 		if (thflags & TH_SYN) {
 			thflags &= ~TH_SYN;
 			th->th_seq++;
 			if (th->th_urp > 1)
 				th->th_urp--;
 			else
 				thflags &= ~TH_URG;
 			todrop--;
 		}
 		/*
 		 * Following if statement from Stevens, vol. 2, p. 960.
 		 */
 		if (todrop > tlen
 		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
 			/*
 			 * Any valid FIN must be to the left of the window.
 			 * At this point the FIN must be a duplicate or out
 			 * of sequence; drop it.
 			 */
 			thflags &= ~TH_FIN;
 
 			/*
 			 * Send an ACK to resynchronize and drop any data.
 			 * But keep on processing for RST or ACK.
 			 */
 			tp->t_flags |= TF_ACKNOW;
 			todrop = tlen;
 			V_tcpstat.tcps_rcvduppack++;
 			V_tcpstat.tcps_rcvdupbyte += todrop;
 		} else {
 			V_tcpstat.tcps_rcvpartduppack++;
 			V_tcpstat.tcps_rcvpartdupbyte += todrop;
 		}
 		drop_hdrlen += todrop;	/* drop from the top afterwards */
 		th->th_seq += todrop;
 		tlen -= todrop;
 		if (th->th_urp > todrop)
 			th->th_urp -= todrop;
 		else {
 			thflags &= ~TH_URG;
 			th->th_urp = 0;
 		}
 	}
 
 	/*
 	 * If new data are received on a connection after the
 	 * user processes are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) &&
 	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
 		char *s;
 
 		KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
 		    "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket "
 			    "was closed, sending RST and removing tcpcb\n",
 			    s, __func__, tcpstates[tp->t_state], tlen);
 			free(s, M_TCPLOG);
 		}
 		tp = tcp_close(tp);
 		V_tcpstat.tcps_rcvafterclose++;
 		rstreason = BANDLIM_UNLIMITED;
 		goto dropwithreset;
 	}
 
 	/*
 	 * If segment ends after window, drop trailing data
 	 * (and PUSH and FIN); if nothing left, just ACK.
 	 */
 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
 	if (todrop > 0) {
 		V_tcpstat.tcps_rcvpackafterwin++;
 		if (todrop >= tlen) {
 			V_tcpstat.tcps_rcvbyteafterwin += tlen;
 			/*
 			 * If window is closed can only take segments at
 			 * window edge, and have to drop data and PUSH from
 			 * incoming segments.  Continue processing, but
 			 * remember to ack.  Otherwise, drop segment
 			 * and ack.
 			 */
 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 				tp->t_flags |= TF_ACKNOW;
 				V_tcpstat.tcps_rcvwinprobe++;
 			} else
 				goto dropafterack;
 		} else
 			V_tcpstat.tcps_rcvbyteafterwin += todrop;
 		m_adj(m, -todrop);
 		tlen -= todrop;
 		thflags &= ~(TH_PUSH|TH_FIN);
 	}
 
 	/*
 	 * If last ACK falls within this segment's sequence numbers,
 	 * record its timestamp.
 	 * NOTE: 
 	 * 1) That the test incorporates suggestions from the latest
 	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 * 2) That updating only on newer timestamps interferes with
 	 *    our earlier PAWS tests, so this check should be solely
 	 *    predicated on the sequence space of this segment.
 	 * 3) That we modify the segment boundary check to be 
 	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len  
 	 *    instead of RFC1323's
 	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
 	 *    This modified check allows us to overcome RFC1323's
 	 *    limitations as described in Stevens TCP/IP Illustrated
 	 *    Vol. 2 p.869. In such cases, we can still calculate the
 	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 		((thflags & (TH_SYN|TH_FIN)) != 0))) {
 		tp->ts_recent_age = ticks;
 		tp->ts_recent = to.to_tsval;
 	}
 
 	/*
 	 * If a SYN is in the window, then this is an
 	 * error and we send an RST and drop the connection.
 	 */
 	if (thflags & TH_SYN) {
 		KASSERT(ti_locked == TI_WLOCKED,
 		    ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 		tp = tcp_drop(tp, ECONNRESET);
 		rstreason = BANDLIM_UNLIMITED;
 		goto drop;
 	}
 
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
 	 * flag is on (half-synchronized state), then queue data for
 	 * later processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_state == TCPS_SYN_RECEIVED ||
 		    (tp->t_flags & TF_NEEDSYN))
 			goto step6;
 		else if (tp->t_flags & TF_ACKNOW)
 			goto dropafterack;
 		else
 			goto drop;
 	}
 
 	/*
 	 * Ack processing.
 	 */
 	switch (tp->t_state) {
 
 	/*
 	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
 	 * ESTABLISHED state and continue processing.
 	 * The ACK was checked above.
 	 */
 	case TCPS_SYN_RECEIVED:
 
 		V_tcpstat.tcps_connects++;
 		soisconnected(so);
 		/* Do window scaling? */
 		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 			tp->snd_wnd = tiwin;
 		}
 		/*
 		 * Make transitions:
 		 *      SYN-RECEIVED  -> ESTABLISHED
 		 *      SYN-RECEIVED* -> FIN-WAIT-1
 		 */
 		tp->t_starttime = ticks;
 		if (tp->t_flags & TF_NEEDFIN) {
 			tp->t_state = TCPS_FIN_WAIT_1;
 			tp->t_flags &= ~TF_NEEDFIN;
 		} else {
 			tp->t_state = TCPS_ESTABLISHED;
 			tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
 		}
 		/*
 		 * If segment contains data or FIN, will call tcp_reass()
 		 * later; if not, do so now to pass queued data to user.
 		 */
 		if (tlen == 0 && (thflags & TH_FIN) == 0)
 			(void) tcp_reass(tp, (struct tcphdr *)0, 0,
 			    (struct mbuf *)0);
 		tp->snd_wl1 = th->th_seq - 1;
 		/* FALLTHROUGH */
 
 	/*
 	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
 	 * ACKs.  If the ack is in the range
 	 *	tp->snd_una < th->th_ack <= tp->snd_max
 	 * then advance tp->snd_una to th->th_ack and drop
 	 * data from the retransmission queue.  If this ACK reflects
 	 * more up to date window information we update our window information.
 	 */
 	case TCPS_ESTABLISHED:
 	case TCPS_FIN_WAIT_1:
 	case TCPS_FIN_WAIT_2:
 	case TCPS_CLOSE_WAIT:
 	case TCPS_CLOSING:
 	case TCPS_LAST_ACK:
 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
 			V_tcpstat.tcps_rcvacktoomuch++;
 			goto dropafterack;
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    ((to.to_flags & TOF_SACK) ||
 		     !TAILQ_EMPTY(&tp->snd_holes)))
 			tcp_sack_doack(tp, &to, th->th_ack);
 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 			if (tlen == 0 && tiwin == tp->snd_wnd) {
 				V_tcpstat.tcps_rcvdupack++;
 				/*
 				 * If we have outstanding data (other than
 				 * a window probe), this is a completely
 				 * duplicate ack (ie, window info didn't
 				 * change), the ack is the biggest we've
 				 * seen and we've seen exactly our rexmt
 				 * threshold of them, assume a packet
 				 * has been dropped and retransmit it.
 				 * Kludge snd_nxt & the congestion
 				 * window so we send only this one
 				 * packet.
 				 *
 				 * We know we're losing at the current
 				 * window size so do congestion avoidance
 				 * (set ssthresh to half the current window
 				 * and pull our congestion window back to
 				 * the new ssthresh).
 				 *
 				 * Dup acks mean that packets have left the
 				 * network (they're now cached at the receiver)
 				 * so bump cwnd by the amount in the receiver
 				 * to keep a constant cwnd packets in the
 				 * network.
 				 *
 				 * When using TCP ECN, notify the peer that
 				 * we reduced the cwnd.
 				 */
 				if (!tcp_timer_active(tp, TT_REXMT) ||
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
 				else if (++tp->t_dupacks > tcprexmtthresh ||
 				    ((V_tcp_do_newreno ||
 				      (tp->t_flags & TF_SACK_PERMIT)) &&
 				     IN_FASTRECOVERY(tp))) {
 					if ((tp->t_flags & TF_SACK_PERMIT) &&
 					    IN_FASTRECOVERY(tp)) {
 						int awnd;
 						
 						/*
 						 * Compute the amount of data in flight first.
 						 * We can inject new data into the pipe iff 
 						 * we have less than 1/2 the original window's 	
 						 * worth of data in flight.
 						 */
 						awnd = (tp->snd_nxt - tp->snd_fack) +
 							tp->sackhint.sack_bytes_rexmit;
 						if (awnd < tp->snd_ssthresh) {
 							tp->snd_cwnd += tp->t_maxseg;
 							if (tp->snd_cwnd > tp->snd_ssthresh)
 								tp->snd_cwnd = tp->snd_ssthresh;
 						}
 					} else
 						tp->snd_cwnd += tp->t_maxseg;
 					(void) tcp_output(tp);
 					goto drop;
 				} else if (tp->t_dupacks == tcprexmtthresh) {
 					tcp_seq onxt = tp->snd_nxt;
 
 					/*
 					 * If we're doing sack, check to
 					 * see if we're already in sack
 					 * recovery. If we're not doing sack,
 					 * check to see if we're in newreno
 					 * recovery.
 					 */
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						if (IN_FASTRECOVERY(tp)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					} else if (V_tcp_do_newreno ||
 					    V_tcp_do_ecn) {
 						if (SEQ_LEQ(th->th_ack,
 						    tp->snd_recover)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					}
 					tcp_congestion_exp(tp);
 					tcp_timer_activate(tp, TT_REXMT, 0);
 					tp->t_rtttime = 0;
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						V_tcpstat.tcps_sack_recovery_episode++;
 						tp->sack_newdata = tp->snd_nxt;
 						tp->snd_cwnd = tp->t_maxseg;
 						(void) tcp_output(tp);
 						goto drop;
 					}
 					tp->snd_nxt = th->th_ack;
 					tp->snd_cwnd = tp->t_maxseg;
 					(void) tcp_output(tp);
 					KASSERT(tp->snd_limited <= 2,
 					    ("%s: tp->snd_limited too big",
 					    __func__));
 					tp->snd_cwnd = tp->snd_ssthresh +
 					     tp->t_maxseg *
 					     (tp->t_dupacks - tp->snd_limited);
 					if (SEQ_GT(onxt, tp->snd_nxt))
 						tp->snd_nxt = onxt;
 					goto drop;
 				} else if (V_tcp_do_rfc3042) {
 					u_long oldcwnd = tp->snd_cwnd;
 					tcp_seq oldsndmax = tp->snd_max;
 					u_int sent;
 
 					KASSERT(tp->t_dupacks == 1 ||
 					    tp->t_dupacks == 2,
 					    ("%s: dupacks not 1 or 2",
 					    __func__));
 					if (tp->t_dupacks == 1)
 						tp->snd_limited = 0;
 					tp->snd_cwnd =
 					    (tp->snd_nxt - tp->snd_una) +
 					    (tp->t_dupacks - tp->snd_limited) *
 					    tp->t_maxseg;
 					(void) tcp_output(tp);
 					sent = tp->snd_max - oldsndmax;
 					if (sent > tp->t_maxseg) {
 						KASSERT((tp->t_dupacks == 2 &&
 						    tp->snd_limited == 0) ||
 						   (sent == tp->t_maxseg + 1 &&
 						    tp->t_flags & TF_SENTFIN),
 						    ("%s: sent too much",
 						    __func__));
 						tp->snd_limited = 2;
 					} else if (sent > 0)
 						++tp->snd_limited;
 					tp->snd_cwnd = oldcwnd;
 					goto drop;
 				}
 			} else
 				tp->t_dupacks = 0;
 			break;
 		}
 
 		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
 		    ("%s: th_ack <= snd_una", __func__));
 
 		/*
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
 		if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
 			if (IN_FASTRECOVERY(tp)) {
 				if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 					if (tp->t_flags & TF_SACK_PERMIT)
 						tcp_sack_partialack(tp, th);
 					else
 						tcp_newreno_partial_ack(tp, th);
 				} else {
 					/*
 					 * Out of fast recovery.
 					 * Window inflation should have left us
 					 * with approximately snd_ssthresh
 					 * outstanding data.
 					 * But in case we would be inclined to
 					 * send a burst, better to do it via
 					 * the slow start mechanism.
 					 */
 					if (SEQ_GT(th->th_ack +
 							tp->snd_ssthresh,
 						   tp->snd_max))
 						tp->snd_cwnd = tp->snd_max -
 								th->th_ack +
 								tp->t_maxseg;
 					else
 						tp->snd_cwnd = tp->snd_ssthresh;
 				}
 			}
 		} else {
 			if (tp->t_dupacks >= tcprexmtthresh &&
 			    tp->snd_cwnd > tp->snd_ssthresh)
 				tp->snd_cwnd = tp->snd_ssthresh;
 		}
 		tp->t_dupacks = 0;
 		/*
 		 * If we reach this point, ACK is not a duplicate,
 		 *     i.e., it ACKs something we sent.
 		 */
 		if (tp->t_flags & TF_NEEDSYN) {
 			/*
 			 * T/TCP: Connection was half-synchronized, and our
 			 * SYN has been ACK'd (so connection is now fully
 			 * synchronized).  Go to non-starred state,
 			 * increment snd_una for ACK of SYN, and check if
 			 * we can do window scaling.
 			 */
 			tp->t_flags &= ~TF_NEEDSYN;
 			tp->snd_una++;
 			/* Do window scaling? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 				tp->rcv_scale = tp->request_r_scale;
 				/* Send window already scaled. */
 			}
 		}
 
 process_ACK:
 		INP_INFO_LOCK_ASSERT(&V_tcbinfo);
 		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
 		    ("tcp_input: process_ACK ti_locked %d", ti_locked));
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		acked = th->th_ack - tp->snd_una;
 		V_tcpstat.tcps_rcvackpack++;
 		V_tcpstat.tcps_rcvackbyte += acked;
 
 		/*
 		 * If we just performed our first retransmit, and the ACK
 		 * arrives within our recovery window, then it was a mistake
 		 * to do the retransmit in the first place.  Recover our
 		 * original cwnd and ssthresh, and proceed to transmit where
 		 * we left off.
 		 */
 		if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
 			++V_tcpstat.tcps_sndrexmitbad;
 			tp->snd_cwnd = tp->snd_cwnd_prev;
 			tp->snd_ssthresh = tp->snd_ssthresh_prev;
 			tp->snd_recover = tp->snd_recover_prev;
 			if (tp->t_flags & TF_WASFRECOVERY)
 				ENTER_FASTRECOVERY(tp);
 			tp->snd_nxt = tp->snd_max;
 			tp->t_badrxtwin = 0;	/* XXX probably not required */
 		}
 
 		/*
 		 * If we have a timestamp reply, update smoothed
 		 * round trip time.  If no timestamp is present but
 		 * transmit timer is running and timed sequence
 		 * number was acked, update smoothed round trip time.
 		 * Since we now have an rtt measurement, cancel the
 		 * timer backoff (cf., Phil Karn's retransmit alg.).
 		 * Recompute the initial retransmit timer.
 		 *
 		 * Some boxes send broken timestamp replies
 		 * during the SYN+ACK phase, ignore
 		 * timestamps of 0 or we could calculate a
 		 * huge RTT and blow up the retransmit timer.
 		 */
 		if ((to.to_flags & TOF_TS) != 0 &&
 		    to.to_tsecr) {
 			if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr)
 				tp->t_rttlow = ticks - to.to_tsecr;
 			tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
 		} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
 			if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
 				tp->t_rttlow = ticks - tp->t_rtttime;
 			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
 		}
 		tcp_xmit_bandwidth_limit(tp, th->th_ack);
 
 		/*
 		 * If all outstanding data is acked, stop retransmit
 		 * timer and remember to restart (more output or persist).
 		 * If there is more data to be acked, restart retransmit
 		 * timer, using current (possibly backed-off) value.
 		 */
 		if (th->th_ack == tp->snd_max) {
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			needoutput = 1;
 		} else if (!tcp_timer_active(tp, TT_PERSIST))
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 
 		/*
 		 * If no data (only SYN) was ACK'd,
 		 *    skip rest of ACK processing.
 		 */
 		if (acked == 0)
 			goto step6;
 
 		/*
 		 * When new data is acked, open the congestion window.
 		 * If the window gives us less than ssthresh packets
 		 * in flight, open exponentially (maxseg per packet).
 		 * Otherwise open linearly: maxseg per window
 		 * (maxseg^2 / cwnd per packet).
 		 * If cwnd > maxseg^2, fix the cwnd increment at 1 byte
 		 * to avoid capping cwnd (as suggested in RFC2581).
 		 */
 		if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
 		    !IN_FASTRECOVERY(tp)) {
 			u_int cw = tp->snd_cwnd;
 			u_int incr = tp->t_maxseg;
 			if (cw > tp->snd_ssthresh)
 				incr = max((incr * incr / cw), 1);
 			tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
 		}
 		SOCKBUF_LOCK(&so->so_snd);
 		if (acked > so->so_snd.sb_cc) {
 			tp->snd_wnd -= so->so_snd.sb_cc;
 			sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc);
 			ourfinisacked = 1;
 		} else {
 			sbdrop_locked(&so->so_snd, acked);
 			tp->snd_wnd -= acked;
 			ourfinisacked = 0;
 		}
 		/* NB: sowwakeup_locked() does an implicit unlock. */
 		sowwakeup_locked(so);
 		/* Detect una wraparound. */
 		if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
 		    !IN_FASTRECOVERY(tp) &&
 		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
 		    SEQ_LEQ(th->th_ack, tp->snd_recover))
 			tp->snd_recover = th->th_ack - 1;
 		if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
 		    IN_FASTRECOVERY(tp) &&
 		    SEQ_GEQ(th->th_ack, tp->snd_recover))
 			EXIT_FASTRECOVERY(tp);
 		tp->snd_una = th->th_ack;
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (SEQ_GT(tp->snd_una, tp->snd_recover))
 				tp->snd_recover = tp->snd_una;
 		}
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
 
 		switch (tp->t_state) {
 
 		/*
 		 * In FIN_WAIT_1 STATE in addition to the processing
 		 * for the ESTABLISHED state if our FIN is now acknowledged
 		 * then enter FIN_WAIT_2.
 		 */
 		case TCPS_FIN_WAIT_1:
 			if (ourfinisacked) {
 				/*
 				 * If we can't receive any more
 				 * data, then closing user can proceed.
 				 * Starting the timer is contrary to the
 				 * specification, but if we don't get a FIN
 				 * we'll hang forever.
 				 *
 				 * XXXjl:
 				 * we should release the tp also, and use a
 				 * compressed state.
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					int timeout;
 
 					soisdisconnected(so);
 					timeout = (tcp_fast_finwait2_recycle) ? 
 						tcp_finwait2_timeout : tcp_maxidle;
 					tcp_timer_activate(tp, TT_2MSL, timeout);
 				}
 				tp->t_state = TCPS_FIN_WAIT_2;
 			}
 			break;
 
 		/*
 		 * In CLOSING STATE in addition to the processing for
 		 * the ESTABLISHED state if the ACK acknowledges our FIN
 		 * then enter the TIME-WAIT state, otherwise ignore
 		 * the segment.
 		 */
 		case TCPS_CLOSING:
 			if (ourfinisacked) {
 				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 				tcp_twstart(tp);
 				INP_INFO_WUNLOCK(&V_tcbinfo);
 				m_freem(m);
 				return;
 			}
 			break;
 
 		/*
 		 * In LAST_ACK, we may still be waiting for data to drain
 		 * and/or to be acked, as well as for the ack of our FIN.
 		 * If our FIN is now acknowledged, delete the TCB,
 		 * enter the closed state and return.
 		 */
 		case TCPS_LAST_ACK:
 			if (ourfinisacked) {
 				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 				tp = tcp_close(tp);
 				goto drop;
 			}
 			break;
 		}
 	}
 
 step6:
 	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
 	KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
 	    ("tcp_do_segment: step6 ti_locked %d", ti_locked));
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Update window information.
 	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 		/* keep track of pure window updates */
 		if (tlen == 0 &&
 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 			V_tcpstat.tcps_rcvwinupd++;
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		tp->snd_wl2 = th->th_ack;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 		needoutput = 1;
 	}
 
 	/*
 	 * Process segments with URG.
 	 */
 	if ((thflags & TH_URG) && th->th_urp &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		/*
 		 * This is a kludge, but if we receive and accept
 		 * random urgent pointers, we'll crash in
 		 * soreceive.  It's hard to imagine someone
 		 * actually wanting to send this much urgent data.
 		 */
 		SOCKBUF_LOCK(&so->so_rcv);
 		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
 			th->th_urp = 0;			/* XXX */
 			thflags &= ~TH_URG;		/* XXX */
 			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
 			goto dodata;			/* XXX */
 		}
 		/*
 		 * If this segment advances the known urgent pointer,
 		 * then mark the data stream.  This should not happen
 		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
 		 * a FIN has been received from the remote side.
 		 * In these states we ignore the URG.
 		 *
 		 * According to RFC961 (Assigned Protocols),
 		 * the urgent pointer points to the last octet
 		 * of urgent data.  We continue, however,
 		 * to consider it to indicate the first octet
 		 * of data past the urgent section as the original
 		 * spec states (in one of two places).
 		 */
 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 			tp->rcv_up = th->th_seq + th->th_urp;
 			so->so_oobmark = so->so_rcv.sb_cc +
 			    (tp->rcv_up - tp->rcv_nxt) - 1;
 			if (so->so_oobmark == 0)
 				so->so_rcv.sb_state |= SBS_RCVATMARK;
 			sohasoutofband(so);
 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		/*
 		 * Remove out of band data so doesn't get presented to user.
 		 * This can happen independent of advancing the URG pointer,
 		 * but if two URG's are pending at once, some out-of-band
 		 * data may creep in... ick.
 		 */
 		if (th->th_urp <= (u_long)tlen &&
 		    !(so->so_options & SO_OOBINLINE)) {
 			/* hdr drop is delayed */
 			tcp_pulloutofband(so, th, m, drop_hdrlen);
 		}
 	} else {
 		/*
 		 * If no out of band data is expected,
 		 * pull receive urgent pointer along
 		 * with the receive window.
 		 */
 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 			tp->rcv_up = tp->rcv_nxt;
 	}
 dodata:							/* XXX */
 	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
 	KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
 	    ("tcp_do_segment: dodata ti_locked %d", ti_locked));
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Process the segment text, merging it into the TCP sequencing queue,
 	 * and arranging for acknowledgment of receipt if necessary.
 	 * This process logically involves adjusting tp->rcv_wnd as data
 	 * is presented to the user (this happens in tcp_usrreq.c,
 	 * case PRU_RCVD).  If a FIN has already been received on this
 	 * connection then we just ignore the text.
 	 */
 	if ((tlen || (thflags & TH_FIN)) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		tcp_seq save_start = th->th_seq;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly queue
 		 * with control block tp.  Set thflags to whether reassembly now
 		 * includes a segment with FIN.  This handles the common case
 		 * inline (segment is the next to be received on an established
 		 * connection, and the queue is empty), avoiding linkage into
 		 * and removal from the queue and repetition of various
 		 * conversions.
 		 * Set DELACK for segments received in order, but ack
 		 * immediately when segments are out of order (so
 		 * fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt &&
 		    LIST_EMPTY(&tp->t_segq) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			if (DELAY_ACK(tp))
 				tp->t_flags |= TF_DELACK;
 			else
 				tp->t_flags |= TF_ACKNOW;
 			tp->rcv_nxt += tlen;
 			thflags = th->th_flags & TH_FIN;
 			V_tcpstat.tcps_rcvpack++;
 			V_tcpstat.tcps_rcvbyte += tlen;
 			ND6_HINT(tp);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				m_freem(m);
 			else
 				sbappendstream_locked(&so->so_rcv, m);
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 		} else {
 			/*
 			 * XXX: Due to the header drop above "th" is
 			 * theoretically invalid by now.  Fortunately
 			 * m_adj() doesn't actually free any mbufs
 			 * when trimming from the head.
 			 */
 			thflags = tcp_reass(tp, th, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
 		if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
 			tcp_update_sack_list(tp, save_start, save_start + tlen);
 #if 0
 		/*
 		 * Note the amount of data that peer has sent into
 		 * our window, in order to estimate the sender's
 		 * buffer size.
 		 * XXX: Unused.
 		 */
 		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
 #endif
 	} else {
 		m_freem(m);
 		thflags &= ~TH_FIN;
 	}
 
 	/*
 	 * If FIN is received ACK the FIN and let the user know
 	 * that the connection is closing.
 	 */
 	if (thflags & TH_FIN) {
 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 			socantrcvmore(so);
 			/*
 			 * If connection is half-synchronized
 			 * (ie NEEDSYN flag on) then delay ACK,
 			 * so it may be piggybacked when SYN is sent.
 			 * Otherwise, since we received a FIN then no
 			 * more input can be expected, send ACK now.
 			 */
 			if (tp->t_flags & TF_NEEDSYN)
 				tp->t_flags |= TF_DELACK;
 			else
 				tp->t_flags |= TF_ACKNOW;
 			tp->rcv_nxt++;
 		}
 		switch (tp->t_state) {
 
 		/*
 		 * In SYN_RECEIVED and ESTABLISHED STATES
 		 * enter the CLOSE_WAIT state.
 		 */
 		case TCPS_SYN_RECEIVED:
 			tp->t_starttime = ticks;
 			/* FALLTHROUGH */
 		case TCPS_ESTABLISHED:
 			tp->t_state = TCPS_CLOSE_WAIT;
 			break;
 
 		/*
 		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
 		 * enter the CLOSING state.
 		 */
 		case TCPS_FIN_WAIT_1:
 			tp->t_state = TCPS_CLOSING;
 			break;
 
 		/*
 		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
 		 * starting the time-wait timer, turning off the other
 		 * standard timers.
 		 */
 		case TCPS_FIN_WAIT_2:
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 			KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
 			    "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
 			    ti_locked));
 
 			tcp_twstart(tp);
 			INP_INFO_WUNLOCK(&V_tcbinfo);
 			return;
 		}
 	}
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	else if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	else
 		panic("%s: dodata epilogue ti_locked %d", __func__,
 		    ti_locked);
 	ti_locked = TI_UNLOCKED;
 
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 
 	/*
 	 * Return any desired output.
 	 */
 	if (needoutput || (tp->t_flags & TF_ACKNOW))
 		(void) tcp_output(tp);
 
 check_delack:
 	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
 	    __func__, ti_locked));
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (tp->t_flags & TF_DELACK) {
 		tp->t_flags &= ~TF_DELACK;
 		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
 	}
 	INP_WUNLOCK(tp->t_inpcb);
 	return;
 
 dropafterack:
 	KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
 	    ("tcp_do_segment: dropafterack ti_locked %d", ti_locked));
 
 	/*
 	 * Generate an ACK dropping incoming segment if it occupies
 	 * sequence space, where the ACK reflects our state.
 	 *
 	 * We can now skip the test for the RST flag since all
 	 * paths to this code happen after packets containing
 	 * RST have been dropped.
 	 *
 	 * In the SYN-RECEIVED state, don't send an ACK unless the
 	 * segment we received passes the SYN-RECEIVED ACK test.
 	 * If it fails send a RST.  This breaks the loop in the
 	 * "LAND" DoS attack, and also prevents an ACK storm
 	 * between two listening ports that have been sent forged
 	 * SYN segments, each with the source address of the other.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
 	    (SEQ_GT(tp->snd_una, th->th_ack) ||
 	     SEQ_GT(th->th_ack, tp->snd_max)) ) {
 		rstreason = BANDLIM_RST_OPENPORT;
 		goto dropwithreset;
 	}
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	else if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	else
 		panic("%s: dropafterack epilogue ti_locked %d", __func__,
 		    ti_locked);
 	ti_locked = TI_UNLOCKED;
 
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	INP_WUNLOCK(tp->t_inpcb);
 	m_freem(m);
 	return;
 
 dropwithreset:
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	else if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	else
 		panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
 	ti_locked = TI_UNLOCKED;
 
 	if (tp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(tp->t_inpcb);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 	return;
 
 drop:
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	else if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 #ifdef INVARIANTS
 	else
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 #endif
 	ti_locked = TI_UNLOCKED;
 
 	/*
 	 * Drop space held by incoming segment and return.
 	 */
 #ifdef TCPDEBUG
 	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 	if (tp != NULL)
 		INP_WUNLOCK(tp->t_inpcb);
 	m_freem(m);
 }
 
 /*
  * Issue RST and make ACK acceptable to originator of segment.
  * The mbuf must still include the original packet header.
  * tp may be NULL.
  */
 static void
 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
     int tlen, int rstreason)
 {
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 
 	if (tp != NULL) {
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 	}
 
 	/* Don't bother if destination was broadcast/multicast. */
 	if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
 		goto drop;
 #ifdef INET6
 	if (mtod(m, struct ip *)->ip_v == 6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
 			goto drop;
 		/* IPv6 anycast check is done at tcp6_input() */
 	} else
 #endif
 	{
 		ip = mtod(m, struct ip *);
 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 			goto drop;
 	}
 
 	/* Perform bandwidth limiting. */
 	if (badport_bandlim(rstreason) < 0)
 		goto drop;
 
 	/* tcp_respond consumes the mbuf chain. */
 	if (th->th_flags & TH_ACK) {
 		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
 		    th->th_ack, TH_RST);
 	} else {
 		if (th->th_flags & TH_SYN)
 			tlen++;
 		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
 		    (tcp_seq)0, TH_RST|TH_ACK);
 	}
 	return;
 drop:
 	m_freem(m);
 }
 
 /*
  * Walk the cnt bytes of TCP options starting at cp and record the
  * ones we understand in *to.
  *
  * to->to_flags is cleared first and then gains one TOF_* bit for
  * each option that is accepted.  An option whose length does not
  * match what its kind requires is skipped.  MSS, window scale and
  * SACK-permitted are honoured only when TO_SYN is set in flags;
  * SACK blocks are honoured only when it is not.  to_signature and
  * to_sacks are set to point into the option buffer itself rather
  * than to copies.
  *
  * Parsing ends at an EOL option, or at the first option whose
  * length byte is missing, less than 2, or larger than the number
  * of bytes that remain.
  */
 static void
 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
 {
 	INIT_VNET_INET(curvnet);
 	int kind, len;
 
 	to->to_flags = 0;
 	for (; cnt > 0; cnt -= len, cp += len) {
 		kind = cp[0];
 		if (kind == TCPOPT_EOL)
 			break;
 		if (kind == TCPOPT_NOP) {
 			len = 1;
 		} else {
 			/* Every other kind is followed by a length byte. */
 			if (cnt < 2)
 				break;
 			len = cp[1];
 			if (len < 2 || len > cnt)
 				break;
 		}
 
 		switch (kind) {
 		case TCPOPT_MAXSEG:
 			if (len == TCPOLEN_MAXSEG && (flags & TO_SYN)) {
 				to->to_flags |= TOF_MSS;
 				bcopy((char *)cp + 2,
 				    (char *)&to->to_mss, sizeof(to->to_mss));
 				to->to_mss = ntohs(to->to_mss);
 			}
 			break;
 		case TCPOPT_WINDOW:
 			if (len == TCPOLEN_WINDOW && (flags & TO_SYN)) {
 				to->to_flags |= TOF_SCALE;
 				/* Clamp the shift to the maximum we support. */
 				to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
 			}
 			break;
 		case TCPOPT_TIMESTAMP:
 			if (len == TCPOLEN_TIMESTAMP) {
 				to->to_flags |= TOF_TS;
 				bcopy((char *)cp + 2,
 				    (char *)&to->to_tsval,
 				    sizeof(to->to_tsval));
 				to->to_tsval = ntohl(to->to_tsval);
 				bcopy((char *)cp + 6,
 				    (char *)&to->to_tsecr,
 				    sizeof(to->to_tsecr));
 				to->to_tsecr = ntohl(to->to_tsecr);
 			}
 			break;
 #ifdef TCP_SIGNATURE
 		/*
 		 * XXX To be able to answer a peer that put the
 		 * TCP_SIGNATURE option in its initial SYN, note here
 		 * that the option was seen so that the syncache code
 		 * can build the correct response.
 		 */
 		case TCPOPT_SIGNATURE:
 			if (len == TCPOLEN_SIGNATURE) {
 				to->to_flags |= TOF_SIGNATURE;
 				to->to_signature = cp + 2;
 			}
 			break;
 #endif
 		case TCPOPT_SACK_PERMITTED:
 			if (len == TCPOLEN_SACK_PERMITTED &&
 			    (flags & TO_SYN) && V_tcp_do_sack)
 				to->to_flags |= TOF_SACKPERM;
 			break;
 		case TCPOPT_SACK:
 			/* Payload must be a non-empty whole number of blocks. */
 			if (len > 2 && (len - 2) % TCPOLEN_SACK == 0 &&
 			    !(flags & TO_SYN)) {
 				to->to_flags |= TOF_SACK;
 				to->to_nsacks = (len - 2) / TCPOLEN_SACK;
 				to->to_sacks = cp + 2;
 				V_tcpstat.tcps_sack_rcv_blocks++;
 			}
 			break;
 		default:
 			/* NOP and unknown options are stepped over. */
 			break;
 		}
 	}
 }
 
 /*
  * Remove the out-of-band byte from a segment so that it does not
  * show up in the user's normal data stream.  The byte is saved in
  * tp->t_iobc and TCPOOB_HAVEDATA is set; the mbuf holding it is
  * shortened by one.  The segment length used for sequencing is not
  * touched here.
  *
  * off is the number of bytes in the chain that precede the segment
  * data.  Panics if the chain is too short to contain the byte that
  * th->th_urp designates.
  */
 static void
 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
     int off)
 {
 	/* Offset of the urgent byte from the start of the chain. */
 	int pos = off + th->th_urp - 1;
 
 	while (pos >= 0) {
 		struct tcpcb *tp;
 		char *oob;
 
 		if (m->m_len <= pos) {
 			/* Not in this mbuf; step to the next one. */
 			pos -= m->m_len;
 			m = m->m_next;
 			if (m == NULL)
 				break;
 			continue;
 		}
 
 		oob = mtod(m, caddr_t) + pos;
 		tp = sototcpcb(so);
 
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		tp->t_iobc = *oob;
 		tp->t_oobflags |= TCPOOB_HAVEDATA;
 		/* Close the gap left by the extracted byte. */
 		bcopy(oob+1, oob, (unsigned)(m->m_len - pos - 1));
 		m->m_len--;
 		if (m->m_flags & M_PKTHDR)
 			m->m_pkthdr.len--;
 		return;
 	}
 	panic("tcp_pulloutofband");
 }
 
 /*
  * Fold a new round-trip time sample into the connection's smoothed
  * estimators and recompute the current retransmit timeout.
  *
  * rtt is the new sample; the callers in this file derive it from
  * differences of "ticks".  Besides t_srtt, t_rttvar, t_rttbest and
  * t_rxtcur, this clears t_rtttime, t_rxtshift and t_softerror.
  * The inpcb write lock must be held.
  */
 static void
 tcp_xmit_timer(struct tcpcb *tp, int rtt)
 {
 	INIT_VNET_INET(tp->t_inpcb->inp_vnet);
 	int err;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	V_tcpstat.tcps_rttupdated++;
 	tp->t_rttupdated++;
 	if (tp->t_srtt == 0) {
 		/*
 		 * First measurement: take the raw sample as the
 		 * smoothed rtt and half of it as the variance, so the
 		 * first retransmit happens at 3*rtt.
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
 		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * t_srtt is kept in fixed point, scaled by
 		 * 2^TCP_RTT_SHIFT.  The error term below is the new
 		 * sample (moved to origin 0, hence the "- 1") minus a
 		 * fraction of the current estimate; adding it to
 		 * t_srtt is the rfc793 smoothing step
 		 * (srtt = rtt/8 + srtt*7/8, i.e. alpha = .875)
 		 * carried out in fixed point.
 		 */
 		err = ((rtt - 1) << TCP_DELTA_SHIFT)
 			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
 
 		tp->t_srtt += err;
 		if (tp->t_srtt <= 0)
 			tp->t_srtt = 1;
 
 		/*
 		 * t_rttvar is a smoothed mean deviation rather than a
 		 * true variance, kept in fixed point scaled by
 		 * 2^TCP_RTTVAR_SHIFT.  It is smoothed the same way with
 		 * the magnitude of the error
 		 * (rttvar = rttvar*3/4 + |err| / 4), which replaces
 		 * rfc793's wired-in beta.  The retransmit timer is then
 		 * derived from smoothed rtt + 4 times this deviation.
 		 */
 		if (err < 0)
 			err = -err;
 		err -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		tp->t_rttvar += err;
 		if (tp->t_rttvar <= 0)
 			tp->t_rttvar = 1;
 
 		/* Remember the lowest srtt + rttvar seen so far. */
 		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
 			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 
 	/* A sample was taken: stop timing and cancel any backoff. */
 	tp->t_rtttime = 0;
 	tp->t_rxtshift = 0;
 
 	/*
 	 * The retransmit should fire at rtt + 4 * rttvar.  The
 	 * smoothing above leaves srtt and rttvar each with about +1/2
 	 * tick of bias on average.  For the timer we want 1/2 tick of
 	 * rounding plus 1 extra tick for the +-1/2 tick uncertainty in
 	 * when the timer fires, and the bias supplies exactly that 1.5
 	 * ticks.  Since the bias is only statistical, the value is
 	 * still clamped from below so that it never drops under the
 	 * minimum feasible timer (2 ticks beyond the sample).
 	 */
 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 
 	/*
 	 * An ack came back for a packet that was not retransmitted,
 	 * so any error indication received recently is probably stale
 	 * and can be dropped.  This is not strictly correct (a route
 	 * might have failed after the segment was sent, and the return
 	 * path need not be symmetrical), but it is close enough for
 	 * now.
 	 */
 	tp->t_softerror = 0;
 }
 
 /*
  * Determine a reasonable value for maxseg size.
  * If the route is known, check route for mtu.
  * If none, use an mss that can be handled on the outgoing
  * interface without forcing IP to fragment; if bigger than
  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
  * to utilize large mbufs.  If no route is found, route has no mtu,
  * or the destination isn't local, use a default, hopefully conservative
  * size (usually 512 or the default IP max size, but no more than the mtu
  * of the interface), as we can't discover anything about intervening
  * gateways or networks.  We also initialize the congestion/slow start
  * window to be a single segment if the destination isn't local.
  * While looking at the routing entry, we also initialize other path-dependent
  * parameters from pre-set or cached values in the routing entry.
  *
  * Also take into account the space needed for options that we
  * send regularly.  Make maxseg shorter by that amount to assure
  * that we can send maxseg amount of data even when the options
  * are present.  Store the upper limit of the length of options plus
  * data in maxopd.
  *
  * In case of T/TCP, we call this routine during implicit connection
  * setup as well (offer = -1), to initialize maxseg from the cached
  * MSS of our peer.
  *
  * NOTE that this routine is only called when we process an incoming
  * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
  */
 void
 tcp_mss_update(struct tcpcb *tp, int offer,
     struct hc_metrics_lite *metricptr, int *mtuflags)
 {
 	/*
 	 * Parameters:
 	 *   tp        - connection whose t_maxopd and t_maxseg are computed.
 	 *   offer     - MSS from the peer: 0 when the SYN carried no MSS
 	 *               option, -1 when no SYN has been received yet,
 	 *               otherwise the advertised value.
 	 *   metricptr - if non-NULL, receives a copy of the hostcache
 	 *               metrics (zeroed when we return early because there
 	 *               is no route).
 	 *   mtuflags  - handed through unchanged to tcp_maxmtu() or
 	 *               tcp_maxmtu6().
 	 */
 	INIT_VNET_INET(tp->t_inpcb->inp_vnet);
 	int mss;
 	u_long maxmtu;
 	struct inpcb *inp = tp->t_inpcb;
 	struct hc_metrics_lite metrics;
 	/* 'offer' is overwritten below; keep the caller's value for later. */
 	int origoffer = offer;
 #ifdef INET6
 	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
 	size_t min_protoh = isipv6 ?
 			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
 			    sizeof (struct tcpiphdr);
 #else
 	const size_t min_protoh = sizeof(struct tcpiphdr);
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* Initialize. */
 #ifdef INET6
 	if (isipv6) {
 		maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags);
 		tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
 	} else
 #endif
 	{
 		maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags);
 		tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
 	}
 
 	/*
 	 * No route to sender, stay with default mss and return.
 	 */
 	if (maxmtu == 0) {
 		/*
 		 * In case we return early we need to initialize metrics
 		 * to a defined state as tcp_hc_get() would do for us
 		 * if there was no cache hit.
 		 */
 		if (metricptr != NULL)
 			bzero(metricptr, sizeof(struct hc_metrics_lite));
 		return;
 	}
 
 	/* What have we got? */
 	switch (offer) {
 		case 0:
 			/*
 			 * Offer == 0 means that there was no MSS on the SYN
 			 * segment, in this case we use tcp_mssdflt as
 			 * already assigned to t_maxopd above.
 			 */
 			offer = tp->t_maxopd;
 			break;
 
 		case -1:
 			/*
 			 * Offer == -1 means that we didn't receive SYN yet.
 			 *
 			 * NOTE(review): the value left in 'offer' for this
 			 * case depends on whether max() compares its
 			 * arguments as signed or unsigned; confirm against
 			 * the max() definition in use.
 			 */
 			/* FALLTHROUGH */
 
 		default:
 			/*
 			 * Prevent DoS attack with too small MSS. Round up
 			 * to at least minmss.
 			 */
 			offer = max(offer, V_tcp_minmss);
 	}
 
 	/*
 	 * rmx information is now retrieved from tcp_hostcache.
 	 */
 	tcp_hc_get(&inp->inp_inc, &metrics);
 	if (metricptr != NULL)
 		bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
 
 	/*
 	 * If there's a discovered mtu in the tcp hostcache, use it;
 	 * else, use the link mtu.
 	 */
 	if (metrics.rmx_mtu)
 		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
 	else {
 #ifdef INET6
 		if (isipv6) {
 			mss = maxmtu - min_protoh;
 			if (!V_path_mtu_discovery &&
 			    !in6_localaddr(&inp->in6p_faddr))
 				mss = min(mss, V_tcp_v6mssdflt);
 		} else
 #endif
 		{
 			mss = maxmtu - min_protoh;
 			if (!V_path_mtu_discovery &&
 			    !in_localaddr(inp->inp_faddr))
 				mss = min(mss, V_tcp_mssdflt);
 		}
 		/*
 		 * XXX - The above conditional (mss = maxmtu - min_protoh)
 		 * probably violates the TCP spec.
 		 * The problem is that, since we don't know the
 		 * other end's MSS, we are supposed to use a conservative
 		 * default.  But, if we do that, then MTU discovery will
 		 * never actually take place, because the conservative
 		 * default is much less than the MTUs typically seen
 		 * on the Internet today.  For the moment, we'll sweep
 		 * this under the carpet.
 		 *
 		 * The conservative default might not actually be a problem
 		 * if the only case this occurs is when sending an initial
 		 * SYN with options and data to a host we've never talked
 		 * to before.  Then, they will reply with an MSS value which
 		 * will get recorded and the new parameters should get
 		 * recomputed.  For Further Study.
 		 */
 	}
 	/* Never exceed what the peer offered (or the default chosen above). */
 	mss = min(mss, offer);
 
 	/*
 	 * Sanity check: make sure that maxopd will be large
 	 * enough to allow some data on segments even if
 	 * all the option space is used (40 bytes).  Otherwise
 	 * funny things may happen in tcp_output.
 	 */
 	mss = max(mss, 64);
 
 	/*
 	 * maxopd stores the maximum length of data AND options
 	 * in a segment; maxseg is the amount of data in a normal
 	 * segment.  We need to store this value (maxopd) apart
 	 * from maxseg, because now every segment carries options
 	 * and thus we normally have somewhat less data in segments.
 	 */
 	tp->t_maxopd = mss;
 
 	/*
 	 * origoffer==-1 indicates that no segments were received yet.
 	 * In this case we just guess.
 	 *
 	 * The timestamp option space is subtracted only when we request
 	 * timestamps, options are not disabled (TF_NOOPT clear), and the
 	 * peer either sent a timestamp or has not been heard from yet.
 	 */
 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 	    (origoffer == -1 ||
 	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
 		mss -= TCPOLEN_TSTAMP_APPA;
 
 	/*
 	 * When mss exceeds MCLBYTES, round it down to a multiple of
 	 * MCLBYTES.  The mask form is selected at compile time when
 	 * MCLBYTES is a power of two; otherwise divide and multiply.
 	 */
 #if	(MCLBYTES & (MCLBYTES - 1)) == 0
 	if (mss > MCLBYTES)
 		mss &= ~(MCLBYTES-1);
 #else
 	if (mss > MCLBYTES)
 		mss = mss / MCLBYTES * MCLBYTES;
 #endif
 	tp->t_maxseg = mss;
 }
 
 /*
  * Compute the MSS through tcp_mss_update() and then use the hostcache
  * metrics it returns to size both socket buffers and to seed the RTT,
  * ssthresh, bandwidth and congestion window state of tp.  Also sets
  * TF_TSO when the MTU lookup reported CSUM_TSO.
  */
 void
 tcp_mss(struct tcpcb *tp, int offer)
 {
 	int rtt, mss;
 	u_long bufsize;
 	struct inpcb *inp;
 	struct socket *so;
 	struct hc_metrics_lite metrics;
 	int mtuflags = 0;
 #ifdef INET6
 	int isipv6;
 #endif
 	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
 	INIT_VNET_INET(tp->t_vnet);
 	
 	/* Fills in tp->t_maxseg/t_maxopd, metrics and mtuflags. */
 	tcp_mss_update(tp, offer, &metrics, &mtuflags);
 
 	mss = tp->t_maxseg;
 	inp = tp->t_inpcb;
 #ifdef INET6
 	isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
 #endif
 
 	/*
 	 * If there's a pipesize, change the socket buffer to that size,
 	 * don't change if sb_hiwat is different than default (then it
 	 * has been changed on purpose with setsockopt).
 	 * Make the socket buffers an integral number of mss units;
 	 * if the mss is larger than the socket buffer, decrease the mss.
 	 */
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_snd);
 	if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
 		bufsize = metrics.rmx_sendpipe;
 	else
 		bufsize = so->so_snd.sb_hiwat;
 	if (bufsize < mss)
 		mss = bufsize;
 	else {
 		bufsize = roundup(bufsize, mss);
 		if (bufsize > sb_max)
 			bufsize = sb_max;
 		/* Only ever grow the reservation; result is ignored. */
 		if (bufsize > so->so_snd.sb_hiwat)
 			(void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
 	}
 	SOCKBUF_UNLOCK(&so->so_snd);
 	/* mss may have been reduced to fit the send buffer. */
 	tp->t_maxseg = mss;
 
 	/*
 	 * Same treatment for the receive buffer, except that mss is not
 	 * reduced when the buffer is smaller than one segment.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
 		bufsize = metrics.rmx_recvpipe;
 	else
 		bufsize = so->so_rcv.sb_hiwat;
 	if (bufsize > mss) {
 		bufsize = roundup(bufsize, mss);
 		if (bufsize > sb_max)
 			bufsize = sb_max;
 		if (bufsize > so->so_rcv.sb_hiwat)
 			(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	/*
 	 * While we're here, check the others too.
 	 */
 	/* Seed the RTT estimator only if we have no estimate yet. */
 	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
 		tp->t_srtt = rtt;
 		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
 		V_tcpstat.tcps_usedrtt++;
 		if (metrics.rmx_rttvar) {
 			tp->t_rttvar = metrics.rmx_rttvar;
 			V_tcpstat.tcps_usedrttvar++;
 		} else {
 			/* default variation is +- 1 rtt */
 			tp->t_rttvar =
 			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
 		}
 		TCPT_RANGESET(tp->t_rxtcur,
 			      ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
 			      tp->t_rttmin, TCPTV_REXMTMAX);
 	}
 	if (metrics.rmx_ssthresh) {
 		/*
 		 * There's some sort of gateway or interface
 		 * buffer limit on the path.  Use this to set
 		 * the slow start threshold, but set the
 		 * threshold to no less than 2*mss.
 		 */
 		tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
 		V_tcpstat.tcps_usedssthresh++;
 	}
 	if (metrics.rmx_bandwidth)
 		tp->snd_bandwidth = metrics.rmx_bandwidth;
 
 	/*
 	 * Set the slow-start flight size depending on whether this
 	 * is a local network or not.
 	 *
 	 * Extend this so we cache the cwnd too and retrieve it here.
 	 * Make cwnd even bigger than RFC3390 suggests but only if we
 	 * have previous experience with the remote host. Be careful
 	 * not to make cwnd bigger than remote receive window or our own
 	 * send socket buffer. Maybe put some additional upper bound
 	 * on the retrieved cwnd. Should do incremental updates to
 	 * hostcache when cwnd collapses so next connection doesn't
 	 * overload the path again.
 	 *
 	 * RFC3390 says only do this if SYN or SYN/ACK did not get lost.
 	 * We currently check only in syncache_socket for that.
 	 */
 	/*
 	 * TCP_METRICS_CWND is defined unconditionally right here, so the
 	 * cached-cwnd branch below is always compiled in.
 	 */
 #define TCP_METRICS_CWND
 #ifdef TCP_METRICS_CWND
 	if (metrics.rmx_cwnd)
 		tp->snd_cwnd = max(mss,
 				min(metrics.rmx_cwnd / 2,
 				 min(tp->snd_wnd, so->so_snd.sb_hiwat)));
 	else
 #endif
 	if (V_tcp_do_rfc3390)
 		tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
 #ifdef INET6
 	else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
 		 (!isipv6 && in_localaddr(inp->inp_faddr)))
 #else
 	else if (in_localaddr(inp->inp_faddr))
 #endif
 		tp->snd_cwnd = mss * V_ss_fltsz_local;
 	else
 		tp->snd_cwnd = mss * V_ss_fltsz;
 
 	/* Check the interface for TSO capabilities. */
 	if (mtuflags & CSUM_TSO)
 		tp->t_flags |= TF_TSO;
 }
 
 /*
  * Determine the MSS option to send on an outgoing SYN.
  */
 int
 tcp_mssopt(struct in_conninfo *inc)
 {
 	INIT_VNET_INET(curvnet);
 	int mss = 0;
 	u_long maxmtu = 0;	/* MTU from the route/interface lookup */
 	u_long thcmtu = 0;	/* MTU recorded in the TCP hostcache */
 	size_t min_protoh;	/* IP + TCP header bytes to subtract */
-#ifdef INET6
-	int isipv6 = inc->inc_isipv6 ? 1 : 0;
-#endif
 
 	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
 
 	/*
 	 * Pick the address-family default and look up both MTU sources.
 	 */
 #ifdef INET6
-	if (isipv6) {
+	if (inc->inc_flags & INC_ISIPV6) {
 		mss = V_tcp_v6mssdflt;
 		maxmtu = tcp_maxmtu6(inc, NULL);
 		thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
 		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	} else
 #endif
 	{
 		mss = V_tcp_mssdflt;
 		maxmtu = tcp_maxmtu(inc, NULL);
 		thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
 		min_protoh = sizeof(struct tcpiphdr);
 	}
 	/*
 	 * Both known: the smaller one bounds the MSS.  Only one known
 	 * (the other is 0): use that one.  Neither known: keep the
 	 * default assigned above.
 	 */
 	if (maxmtu && thcmtu)
 		mss = min(maxmtu, thcmtu) - min_protoh;
 	else if (maxmtu || thcmtu)
 		mss = max(maxmtu, thcmtu) - min_protoh;
 
 	return (mss);
 }
 
 
 /*
  * When a partial ack arrives, force the retransmission of the
  * next unacknowledged segment.  Do not clear tp->t_dupacks.
  * By setting snd_nxt to ti_ack, this forces retransmission timer to
  * be started again.
  */
 static void
 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
 {
 	/* Saved so both can be restored after the forced tcp_output(). */
 	tcp_seq onxt = tp->snd_nxt;
 	u_long  ocwnd = tp->snd_cwnd;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tcp_timer_activate(tp, TT_REXMT, 0);
 	tp->t_rtttime = 0;
 	/* Rewind the send pointer to the first unacknowledged byte. */
 	tp->snd_nxt = th->th_ack;
 	/*
 	 * Set snd_cwnd to one segment beyond acknowledged offset.
 	 * (tp->snd_una has not yet been updated when this function is called.)
 	 */
 	tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	/* Undo the temporary window; never move snd_nxt backwards. */
 	tp->snd_cwnd = ocwnd;
 	if (SEQ_GT(onxt, tp->snd_nxt))
 		tp->snd_nxt = onxt;
 	/*
 	 * Partial window deflation.  Relies on fact that tp->snd_una
 	 * not updated yet.
 	 */
 	/* Subtract the newly acked amount, flooring at 0, then add one mss. */
 	if (tp->snd_cwnd > th->th_ack - tp->snd_una)
 		tp->snd_cwnd -= th->th_ack - tp->snd_una;
 	else
 		tp->snd_cwnd = 0;
 	tp->snd_cwnd += tp->t_maxseg;
 }
Index: head/sys/netinet/tcp_subr.c
===================================================================
--- head/sys/netinet/tcp_subr.c	(revision 186221)
+++ head/sys/netinet/tcp_subr.c	(revision 186222)
@@ -1,2310 +1,2308 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #ifdef INET6
 #include <sys/domain.h>
 #endif
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
 #include <sys/vimage.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/if.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/in_pcb.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 #include <netinet/ip_icmp.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_offload.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #include <netinet/vinet.h>
 #include <netinet6/ip6protosw.h>
 #include <netinet6/vinet6.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/xform.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 #include <netipsec/key.h>
 #include <sys/syslog.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 #include <sys/md5.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Storage for the TCP tunables when VIMAGE_GLOBALS is defined.  Each of
  * these is given its initial value in tcp_init() through the matching
  * V_-prefixed name, except tcp_inflight_rttthresh's siblings noted there.
  * NOTE(review): presumably the V_ names resolve to these globals in this
  * configuration; confirm against the vimage headers.
  */
 #ifdef VIMAGE_GLOBALS
 int	tcp_mssdflt;
 #ifdef INET6
 int	tcp_v6mssdflt;
 #endif
 int	tcp_minmss;
 int	tcp_do_rfc1323;
 static int	icmp_may_rst;
 static int	tcp_isn_reseed_interval;
 static int	tcp_inflight_enable;
 static int	tcp_inflight_rttthresh;
 static int	tcp_inflight_min;
 static int	tcp_inflight_max;
 static int	tcp_inflight_stab;
 #endif
 
 static int
 sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
 {
 	INIT_VNET_INET(curvnet);
 	int error, new;
 
 	new = V_tcp_mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_mssdflt = new;
 	}
 	return (error);
 }
 
 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
     CTLTYPE_INT|CTLFLAG_RW, tcp_mssdflt, 0,
     &sysctl_net_inet_tcp_mss_check, "I",
     "Default TCP Maximum Segment Size");
 
 #ifdef INET6
 /*
  * Sysctl handler for net.inet.tcp.v6mssdflt: reports the current IPv6
  * default MSS and, on a write, accepts the new value only if it is at
  * least TCP_MINMSS (EINVAL otherwise).
  */
 static int
 sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
 {
 	INIT_VNET_INET(curvnet);
 	int candidate, rc;
 
 	/* Work on a copy so a rejected write leaves the tunable untouched. */
 	candidate = V_tcp_v6mssdflt;
 	rc = sysctl_handle_int(oidp, &candidate, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (candidate < TCP_MINMSS)
 		return (EINVAL);
 	V_tcp_v6mssdflt = candidate;
 	return (0);
 }
 
 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
     CTLTYPE_INT|CTLFLAG_RW, tcp_v6mssdflt, 0,
     &sysctl_net_inet_tcp_mss_v6_check, "I",
    "Default TCP Maximum Segment Size for IPv6");
 #endif
 
 /*
  * Minimum MSS we accept and use. This prevents DoS attacks where
  * we are forced to a ridiculous low MSS like 20 and send hundreds
  * of packets instead of one. The effect scales with the available
  * bandwidth and quickly saturates the CPU and network interface
  * with packet generation and sending. Set to zero to disable MINMSS
  * checking. This setting prevents us from sending too small packets.
  */
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, minmss,
     CTLFLAG_RW, tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323,
     CTLFLAG_RW, tcp_do_rfc1323, 0,
     "Enable rfc1323 (high performance TCP) extensions");
 
 /*
  * The next three knobs are plain file-scope statics registered with
  * SYSCTL_INT rather than the SYSCTL_V_* forms used above.
  */
 static int	tcp_log_debug = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
     &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
 
 /* Read-only at run time; tcp_init() stores the effective hash size here. */
 static int	tcp_tcbhashsize = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
 
 static int	do_tcpdrain = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, pcbcount,
     CTLFLAG_RD, tcbinfo.ipi_count, 0, "Number of active PCBs");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, icmp_may_rst,
     CTLFLAG_RW, icmp_may_rst, 0,
     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, isn_reseed_interval,
     CTLFLAG_RW, tcp_isn_reseed_interval, 0,
     "Seconds between reseeding of ISN secret");
 
 /*
  * TCP bandwidth limiting sysctls.  Note that the default lower bound of
  * 1024 exists only for debugging.  A good production default would be
  * something like 6100.
  *
  * NOTE(review): tcp_init() sets V_tcp_inflight_min to 6144, not 1024;
  * the figure above may be stale.
  */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
     "TCP inflight data limiting");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, enable,
     CTLFLAG_RW, tcp_inflight_enable, 0,
     "Enable automatic TCP inflight data limiting");
 
 static int	tcp_inflight_debug = 0;
 SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
     &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
 
 /* Routed through sysctl_msec_to_ticks rather than a plain int handler. */
 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, rttthresh,
     CTLTYPE_INT|CTLFLAG_RW, tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks,
     "I", "RTT threshold below which inflight will deactivate itself");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, min,
     CTLFLAG_RW, tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, max,
     CTLFLAG_RW, tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, stab,
     CTLFLAG_RW, tcp_inflight_stab, 0,
     "Inflight Algorithm Stabilization 20 = 2 packets");
 
 /* UMA zone for struct sackhole; created in tcp_init(). */
 uma_zone_t sack_hole_zone;
 
 /* Forward declarations for file-local helpers defined further down. */
 static struct inpcb *tcp_notify(struct inpcb *, int);
 static void	tcp_isn_tick(void *);
 
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
  *
  * Note that this can be overridden by the kernel environment
  * variable net.inet.tcp.tcbhashsize; tcp_init() falls back to 512
  * (with a console warning) when the resulting value is not a power
  * of two.
  */
 #ifndef TCBHASHSIZE
 #define TCBHASHSIZE	512
 #endif
 
 /*
  * XXX
  * Callouts should be moved into struct tcp directly.  They are currently
  * separate because the tcpcb structure is exported to userland for sysctl
  * parsing purposes, which do not know about callouts.
  *
  * This is the unit allocated from tcpcb_zone.  tcp_newtcpcb() hands out
  * &tcb and points tcb.t_timers at tt; tcp_discardcb() returns the tcpcb
  * pointer itself to the zone, so tcb must remain the first member.
  */
 struct tcpcb_mem {
 	struct	tcpcb		tcb;	/* the control block proper */
 	struct	tcp_timer	tt;	/* its timers, reached via t_timers */
 };
 
 /* UMA zone of struct tcpcb_mem; created in tcp_init(). */
 static uma_zone_t tcpcb_zone;
 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
 /* Callout initialised in tcp_init() and stopped in tcp_fini(). */
 struct callout isn_callout;
 static struct mtx isn_mtx;
 
 /* Thin wrappers around the isn_mtx mutex (MTX_DEF). */
 #define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
 #define	ISN_LOCK()	mtx_lock(&isn_mtx)
 #define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)
 
 /*
  * TCP initialization.
  */
 /*
  * Handler registered by tcp_init() for the maxsockets_change event:
  * re-applies the current maxsockets value as the limit of the inpcb and
  * tcpcb zones and lets the time-wait code adjust its own zone.  The
  * 'tag' argument is unused.
  */
 static void
 tcp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
 	uma_zone_set_max(tcpcb_zone, maxsockets);
 	tcp_tw_zone_change();
 }
 
 /*
  * Item-initialisation callback handed to uma_zcreate() for the "inpcb"
  * zone in tcp_init(): sets up the lock of the inpcb stored at 'mem'.
  * 'size' and 'flags' are not used.  Always returns 0.
  */
 static int
 tcp_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *pcb;
 
 	pcb = mem;
 	INP_LOCK_INIT(pcb, "inp", "tcpinp");
 	return (0);
 }
 
 /*
  * One-time TCP setup: assigns the default value of every tunable,
  * creates the PCB hash tables and the UMA zones, initialises the
  * subsystems (time-wait, syncache, hostcache, reassembly), sets up the
  * ISN lock and callout, and registers the shutdown and
  * maxsockets_change event handlers.
  */
 void
 tcp_init(void)
 {
 	INIT_VNET_INET(curvnet);
 	int hashsize;
 
 	/* Defaults for the tunables. */
 	V_blackhole = 0;
 	V_tcp_delack_enabled = 1;
 	V_drop_synfin = 0;
 	V_tcp_do_rfc3042 = 1;
 	V_tcp_do_rfc3390 = 1;
 	V_tcp_do_ecn = 0;
 	V_tcp_ecn_maxretries = 1;
 	V_tcp_insecure_rst = 0;
 	V_tcp_do_autorcvbuf = 1;
 	V_tcp_autorcvbuf_inc = 16*1024;
 	V_tcp_autorcvbuf_max = 256*1024;
 
 	V_tcp_mssdflt = TCP_MSS;
 #ifdef INET6
 	V_tcp_v6mssdflt = TCP6_MSS;
 #endif
 	V_tcp_minmss = TCP_MINMSS;
 	V_tcp_do_rfc1323 = 1;
 	V_icmp_may_rst = 1;
 	V_tcp_isn_reseed_interval = 0;
 	V_tcp_inflight_enable = 1;
 	V_tcp_inflight_min = 6144;
 	V_tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	V_tcp_inflight_stab = 20;
 
 	V_path_mtu_discovery = 1;
 	V_ss_fltsz = 1;
 	V_ss_fltsz_local = 4;
 	V_tcp_do_newreno = 1;
 	V_tcp_do_tso = 1;
 	V_tcp_do_autosndbuf = 1;
 	V_tcp_autosndbuf_inc = 8*1024;
 	V_tcp_autosndbuf_max = 256*1024;
 
 	V_nolocaltimewait = 0;
 
 	V_tcp_do_sack = 1;
 	V_tcp_sack_maxholes = 128;
 	V_tcp_sack_globalmaxholes = 65536;
 	V_tcp_sack_globalholes = 0;
 
 	/* Timer defaults. */
 	tcp_delacktime = TCPTV_DELACK;
 	tcp_keepinit = TCPTV_KEEP_INIT;
 	tcp_keepidle = TCPTV_KEEP_IDLE;
 	tcp_keepintvl = TCPTV_KEEPINTVL;
 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
 	tcp_msl = TCPTV_MSL;
 	tcp_rexmit_min = TCPTV_MIN;
 	/* Never allow a retransmit minimum below 1. */
 	if (tcp_rexmit_min < 1)
 		tcp_rexmit_min = 1;
 	tcp_rexmit_slop = TCPTV_CPU_VAR;
 	V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
 	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
 
 	/* The SACK default set above may be overridden by a tunable. */
 	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
 
 	INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp");
 	LIST_INIT(&V_tcb);
 	V_tcbinfo.ipi_listhead = &V_tcb;
 	/*
 	 * Hash size: compile-time default, optionally overridden by the
 	 * tunable, and forced back to 512 if it is not a power of two.
 	 * The effective value is recorded in tcp_tcbhashsize.
 	 */
 	hashsize = TCBHASHSIZE;
 	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
 	if (!powerof2(hashsize)) {
 		printf("WARNING: TCB hash size not a power of 2\n");
 		hashsize = 512; /* safe default */
 	}
 	tcp_tcbhashsize = hashsize;
 	V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB,
 	    &V_tcbinfo.ipi_hashmask);
 	V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB,
 	    &V_tcbinfo.ipi_porthashmask);
 	V_tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
 	    NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
 	/*
 	 * TCP_MINPROTOHDR is a macro local to this function: the IPv6+TCP
 	 * header size when INET6 is configured, else sizeof(struct tcpiphdr).
 	 * Raise max_protohdr to it if needed and panic if link header plus
 	 * protocol header cannot fit in MHLEN.
 	 */
 #ifdef INET6
 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
 #else /* INET6 */
 #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
 #endif /* INET6 */
 	if (max_protohdr < TCP_MINPROTOHDR)
 		max_protohdr = TCP_MINPROTOHDR;
 	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
 		panic("tcp_init");
 #undef TCP_MINPROTOHDR
 	/*
 	 * These have to be type stable for the benefit of the timers.
 	 */
 	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(tcpcb_zone, maxsockets);
 	tcp_tw_init();
 	syncache_init();
 	tcp_hc_init();
 	tcp_reass_init();
 	ISN_LOCK_INIT();
 	callout_init(&isn_callout, CALLOUT_MPSAFE);
 	/* NOTE(review): presumably this first call arms isn_callout; confirm. */
 	tcp_isn_tick(NULL);
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 	sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
 		EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Handler registered by tcp_init() for the shutdown_pre_sync event:
  * stops the ISN callout.  The 'xtp' argument is unused.
  */
 void
 tcp_fini(void *xtp)
 {
 
 	callout_stop(&isn_callout);
 }
 
 /*
  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
  * tcp_template used to store this data in mbufs, but we now recopy it out
  * of the tcpcb each time to conserve mbufs.
  */
 void
 tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
 {
 	/*
 	 * inp     - source of the addresses, ports, TOS/TTL and flow label;
 	 *           must be write-locked.
 	 * ip_ptr  - buffer receiving the IPv4 or IPv6 header, chosen by
 	 *           INP_IPV6 in inp->inp_vflag.
 	 * tcp_ptr - buffer receiving the TCP header.
 	 */
 	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
 
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		struct ip6_hdr *ip6;
 
 		ip6 = (struct ip6_hdr *)ip_ptr;
 		/*
 		 * Replace only the masked flow-info and version bits; the
 		 * other bits already in the buffer are kept as they are.
 		 */
 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 			(inp->inp_flow & IPV6_FLOWINFO_MASK);
 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 			(IPV6_VERSION & IPV6_VERSION_MASK);
 		ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
 		ip6->ip6_src = inp->in6p_laddr;
 		ip6->ip6_dst = inp->in6p_faddr;
 	} else
 #endif
 	{
 		struct ip *ip;
 
 		ip = (struct ip *)ip_ptr;
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = 5;		/* header length in 32-bit words */
 		ip->ip_tos = inp->inp_ip_tos;
 		/* Length, id, offset and checksum are left zero here. */
 		ip->ip_len = 0;
 		ip->ip_id = 0;
 		ip->ip_off = 0;
 		ip->ip_ttl = inp->inp_ip_ttl;
 		ip->ip_sum = 0;
 		ip->ip_p = IPPROTO_TCP;
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst = inp->inp_faddr;
 	}
 	/* TCP header: ports from the inpcb, everything else zeroed. */
 	th->th_sport = inp->inp_lport;
 	th->th_dport = inp->inp_fport;
 	th->th_seq = 0;
 	th->th_ack = 0;
 	th->th_x2 = 0;
 	th->th_off = 5;		/* header length in 32-bit words */
 	th->th_flags = 0;
 	th->th_win = 0;
 	th->th_urp = 0;
 	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
 }
 
 /*
  * Create template to be used to send tcp packets on a connection.
  * Allocates a struct tcptemp with malloc(9) (M_TEMP, M_NOWAIT) and fills
  * in a skeletal tcp/ip header; returns NULL if the allocation fails.  The
  * only use for this function is in keepalives, which use tcp_respond.
  */
 struct tcptemp *
 tcpip_maketemplate(struct inpcb *inp)
 {
 	struct tcptemp *tmpl;
 
 	/* Non-sleeping allocation: the result may be NULL. */
 	tmpl = malloc(sizeof(*tmpl), M_TEMP, M_NOWAIT);
 	if (tmpl != NULL)
 		tcpip_fillheaders(inp, (void *)&tmpl->tt_ipgen,
 		    (void *)&tmpl->tt_t);
 	return (tmpl);
 }
 
 /*
  * Send a single message to the TCP at address specified by
  * the given TCP/IP header.  If m == NULL, then we make a copy
  * of the tcpiphdr at ti and send directly to the addressed host.
  * This is used to force keep alive messages out using the TCP
  * template for a connection.  If flags are given then we send
  * a message back to the TCP which originated the segment ti,
  * and discard the mbuf containing it and any other attached mbufs.
  *
  * In any case the ack and sequence number of the transmitted
  * segment are as specified by the parameters.
  *
  * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
  */
 void
 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
     tcp_seq ack, tcp_seq seq, int flags)
 {
 	/*
 	 * Parameters (as used below):
 	 *   tp      - connection, or NULL.  When non-NULL its inpcb must be
 	 *             write-locked and, unless TH_RST is set in flags, the
 	 *             free receive-buffer space is advertised as the window.
 	 *   ipgen   - start of the IP or IPv6 header; the version field read
 	 *             through struct ip selects the address family.
 	 *   th      - TCP header the reply is built from.
 	 *   m       - mbuf to reuse for the reply, or NULL to allocate one;
 	 *             with m == NULL the flags argument is replaced by TH_ACK.
 	 *   ack/seq - stored in network byte order in the outgoing header.
 	 * At least one of tp and m must be non-NULL.
 	 */
 	INIT_VNET_INET(curvnet);
 	int tlen;
 	int win = 0;
 	struct ip *ip;
 	struct tcphdr *nth;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	int isipv6;
 #endif /* INET6 */
 	int ipflags = 0;	/* never modified; passed to the output routine */
 	struct inpcb *inp;
 
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 
 #ifdef INET6
 	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
 	ip6 = ipgen;
 #endif /* INET6 */
 	ip = ipgen;
 
 	if (tp != NULL) {
 		inp = tp->t_inpcb;
 		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
 		INP_WLOCK_ASSERT(inp);
 	} else
 		inp = NULL;
 
 	if (tp != NULL) {
 		if (!(flags & TH_RST)) {
 			/* Clamp to the largest window the scale can express. */
 			win = sbspace(&inp->inp_socket->so_rcv);
 			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
 				win = (long)TCP_MAXWIN << tp->rcv_scale;
 		}
 	}
 	if (m == NULL) {
 		/*
 		 * No mbuf supplied: allocate one, leave room for the link
 		 * header and copy the IP and TCP headers into it.  Silently
 		 * give up if no mbuf is available.
 		 */
 		m = m_gethdr(M_DONTWAIT, MT_DATA);
 		if (m == NULL)
 			return;
 		tlen = 0;
 		m->m_data += max_linkhdr;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(m, struct ip6_hdr *);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 	      {
 		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
 		ip = mtod(m, struct ip *);
 		nth = (struct tcphdr *)(ip + 1);
 	      }
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		flags = TH_ACK;
 	} else {
 		/*
 		 * Reuse the mbuf.
 		 * XXX MRT We inherit the FIB, which is lucky.
 		 */
 		/* Drop any chained mbufs; only the first one is kept. */
 		m_freem(m->m_next);
 		m->m_next = NULL;
 		m->m_data = (caddr_t)ipgen;
 		/* m_len is set later */
 		tlen = 0;
 		/* Swap addresses and ports so the packet goes back to its sender. */
 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
 #ifdef INET6
 		if (isipv6) {
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 	      {
 		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
 		nth = (struct tcphdr *)(ip + 1);
 	      }
 		if (th != nth) {
 			/*
 			 * this is usually a case when an extension header
 			 * exists between the IPv6 header and the
 			 * TCP header.
 			 */
 			nth->th_sport = th->th_sport;
 			nth->th_dport = th->th_dport;
 		}
 		xchg(nth->th_dport, nth->th_sport, n_short);
 #undef xchg
 	}
 	/*
 	 * Finish the network-layer header.  tlen is 0 on entry to this
 	 * step on both paths, so the packet is headers only.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		ip6->ip6_flow = 0;
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
 						tlen));
 		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 	} else
 #endif
 	{
 		tlen += sizeof (struct tcpiphdr);
 		ip->ip_len = tlen;
 		ip->ip_ttl = V_ip_defttl;
 		if (V_path_mtu_discovery)
 			ip->ip_off |= IP_DF;
 	}
 	m->m_len = tlen;
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 #ifdef MAC
 	if (inp != NULL) {
 		/*
 		 * Packet is associated with a socket, so allow the
 		 * label of the response to reflect the socket label.
 		 */
 		INP_WLOCK_ASSERT(inp);
 		mac_inpcb_create_mbuf(inp, m);
 	} else {
 		/*
 		 * Packet is not associated with a socket, so possibly
 		 * update the label in place.
 		 */
 		mac_netinet_tcp_reply(m);
 	}
 #endif
 	/* Fill in the TCP header of the reply. */
 	nth->th_seq = htonl(seq);
 	nth->th_ack = htonl(ack);
 	nth->th_x2 = 0;
 	nth->th_off = sizeof (struct tcphdr) >> 2;
 	nth->th_flags = flags;
 	/* With a connection the window is scaled down by rcv_scale. */
 	if (tp != NULL)
 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
 	else
 		nth->th_win = htons((u_short)win);
 	nth->th_urp = 0;
 	/*
 	 * Checksum: computed in full here for IPv6; for IPv4 only the
 	 * pseudo-header sum is stored and CSUM_TCP is requested in the
 	 * packet header.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		nth->th_sum = 0;
 		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
 					sizeof(struct ip6_hdr),
 					tlen - sizeof(struct ip6_hdr));
 		ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
 		    NULL, NULL);
 	} else
 #endif /* INET6 */
 	{
 		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 	}
 #ifdef TCPDEBUG
 	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
 #endif
 	/* Hand the packet to the matching IP output routine; errors ignored. */
 #ifdef INET6
 	if (isipv6)
 		(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
 	else
 #endif /* INET6 */
 	(void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
 }
 
 /*
  * Create a new TCP control block, making an
  * empty reassembly queue and hooking it to the argument
  * protocol control block.  The `inp' parameter must have
  * come from the zone allocator set up in tcp_init().
  */
 struct tcpcb *
 tcp_newtcpcb(struct inpcb *inp)
 {
 	INIT_VNET_INET(inp->inp_vnet);
 	struct tcpcb_mem *tm;
 	struct tcpcb *tp;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	/*
 	 * Zeroed, non-sleeping allocation; returns NULL to the caller when
 	 * the zone cannot supply an item.
 	 */
 	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
 	if (tm == NULL)
 		return (NULL);
 	/* The tcpcb and its timers live in the same allocation. */
 	tp = &tm->tcb;
 	tp->t_timers = &tm->tt;
 	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
 	/* Start from the default MSS of the socket's address family. */
 	tp->t_maxseg = tp->t_maxopd =
 #ifdef INET6
 		isipv6 ? V_tcp_v6mssdflt :
 #endif /* INET6 */
 		V_tcp_mssdflt;
 
 	/* Set up our timeouts. */
 	callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
 
 	/*
 	 * t_flags is zero from M_ZERO, so the plain assignment below is
 	 * safe even when the rfc1323 branch is skipped.
 	 */
 	if (V_tcp_do_rfc1323)
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 	if (V_tcp_do_sack)
 		tp->t_flags |= TF_SACK_PERMIT;
 	TAILQ_INIT(&tp->snd_holes);
 	tp->t_inpcb = inp;	/* XXX */
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
 	 * reasonable initial retransmit time.
 	 */
 	tp->t_srtt = TCPTV_SRTTBASE;
 	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	/* All three windows start at the largest scaled window value. */
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
 	tp->t_bw_rtttime = ticks;
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
 	/* Link the inpcb back to its new control block. */
 	inp->inp_ppcb = tp;
 	return (tp);		/* XXX */
 }
 
 /*
  * Drop a TCP connection, reporting
  * the specified error.  If connection is synchronized,
  * then send a RST to peer.
  */
 struct tcpcb *
 tcp_drop(struct tcpcb *tp, int errno)
 {
 	INIT_VNET_INET(tp->t_inpcb->inp_vnet);
 	struct socket *sock = tp->t_inpcb->inp_socket;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (!TCPS_HAVERCVDSYN(tp->t_state)) {
 		/* Never synchronized: just count the aborted attempt. */
 		V_tcpstat.tcps_conndrops++;
 	} else {
 		/* Synchronized: mark closed, emit the reset, count the drop. */
 		tp->t_state = TCPS_CLOSED;
 		(void) tcp_output_reset(tp);
 		V_tcpstat.tcps_drops++;
 	}
 	/* A pending soft error takes precedence over a plain timeout. */
 	sock->so_error = (errno == ETIMEDOUT && tp->t_softerror) ?
 	    tp->t_softerror : errno;
 	return (tcp_close(tp));
 }
 
 /*
  * Tear down the tcpcb attached to an inpcb: stop its timers, save the
  * connection's metrics in the hostcache when enough RTT samples were
  * taken, free the reassembly queue and SACK holes, detach any offload
  * device, unlink tp from the inpcb and return it to tcpcb_zone.
  * The inpcb must be write-locked.
  */
 void
 tcp_discardcb(struct tcpcb *tp)
 {
 	INIT_VNET_INET(tp->t_vnet);
 	struct tseg_qent *q;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Make sure that all of our timers are stopped before we
 	 * delete the PCB.
 	 */
 	callout_stop(&tp->t_timers->tt_rexmt);
 	callout_stop(&tp->t_timers->tt_persist);
 	callout_stop(&tp->t_timers->tt_keep);
 	callout_stop(&tp->t_timers->tt_2msl);
 	callout_stop(&tp->t_timers->tt_delack);
 
 	/*
 	 * If we got enough samples through the srtt filter,
 	 * save the rtt and rttvar in the routing entry.
 	 * 'Enough' is arbitrarily defined as 4 rtt samples.
 	 * 4 samples is enough for the srtt filter to converge
 	 * to within enough % of the correct value; fewer samples
 	 * and we could save a bogus rtt. The danger is not high
 	 * as tcp quickly recovers from everything.
 	 * XXX: Works very well but needs some more statistics!
 	 */
 	if (tp->t_rttupdated >= 4) {
 		struct hc_metrics_lite metrics;
 		u_long ssthresh;
 
 		bzero(&metrics, sizeof(metrics));
 		/*
 		 * Update the ssthresh always when the conditions below
 		 * are satisfied. This gives us better new start value
 		 * for the congestion avoidance for new connections.
 		 * ssthresh is only set if packet loss occurred on a session.
 		 *
 		 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
 		 * being torn down.  Ideally this code would not use 'so'.
 		 */
 		ssthresh = tp->snd_ssthresh;
 		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
 			/*
 			 * convert the limit from user data bytes to
 			 * packets then to packet data bytes.
 			 */
 			/* Round to the nearest whole segment, minimum two. */
 			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
 			if (ssthresh < 2)
 				ssthresh = 2;
 			/* Per-packet size = t_maxseg plus IP and TCP headers. */
 			ssthresh *= (u_long)(tp->t_maxseg +
 #ifdef INET6
 				      (isipv6 ? sizeof (struct ip6_hdr) +
 					       sizeof (struct tcphdr) :
 #endif
 				       sizeof (struct tcpiphdr)
 #ifdef INET6
 				       )
 #endif
 				      );
 		} else
 			ssthresh = 0;
 		metrics.rmx_ssthresh = ssthresh;
 
 		metrics.rmx_rtt = tp->t_srtt;
 		metrics.rmx_rttvar = tp->t_rttvar;
 		/* XXX: This wraps if the pipe is more than 4 Gbit per second */
 		metrics.rmx_bandwidth = tp->snd_bandwidth;
 		metrics.rmx_cwnd = tp->snd_cwnd;
 		metrics.rmx_sendpipe = 0;
 		metrics.rmx_recvpipe = 0;
 
 		tcp_hc_update(&inp->inp_inc, &metrics);
 	}
 
 	/* free the reassembly queue, if any */
 	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
 		LIST_REMOVE(q, tqe_q);
 		m_freem(q->tqe_m);
 		uma_zfree(tcp_reass_zone, q);
 		/* Keep the per-connection and global queue counts in step. */
 		tp->t_segqlen--;
 		V_tcp_reass_qsize--;
 	}
 	/* Disconnect offload device, if any. */
 	tcp_offload_detach(tp);
 		
 	tcp_free_sackholes(tp);
 	/* Sever both directions of the inpcb <-> tcpcb link before freeing. */
 	inp->inp_ppcb = NULL;
 	tp->t_inpcb = NULL;
 	uma_zfree(tcpcb_zone, tp);
 }
 
 /*
  * Close a TCP control block: mark the inpcb dropped, disconnect the socket
  * and, when the inpcb carries the INP_SOCKREF reference, give that
  * reference up through sofree().
  *
  * The tcbinfo write lock and the inpcb write lock must be held on entry.
  * Returns NULL if the INP_SOCKREF path was taken (the inpcb lock has been
  * released in that case); otherwise returns tp with the inpcb still locked.
  */
 struct tcpcb *
 tcp_close(struct tcpcb *tp)
 {
 	INIT_VNET_INET(tp->t_inpcb->inp_vnet);
 	struct inpcb *pcb;
 	struct socket *sock;
 
 	pcb = tp->t_inpcb;
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(pcb);
 
 	/* Listening sockets may have been handed to an offload device. */
 	if (tp->t_state == TCPS_LISTEN)
 		tcp_offload_listen_close(tp);
 
 	in_pcbdrop(pcb);
 	V_tcpstat.tcps_closed++;
 
 	KASSERT(pcb->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
 	sock = pcb->inp_socket;
 	soisdisconnected(sock);
 
 	/* Without a protocol-held socket reference there is nothing to free. */
 	if ((pcb->inp_vflag & INP_SOCKREF) == 0)
 		return (tp);
 
 	KASSERT(sock->so_state & SS_PROTOREF,
 	    ("tcp_close: !SS_PROTOREF"));
 	pcb->inp_vflag &= ~INP_SOCKREF;
 	INP_WUNLOCK(pcb);
 	ACCEPT_LOCK();
 	SOCK_LOCK(sock);
 	sock->so_state &= ~SS_PROTOREF;
 	sofree(sock);
 	return (NULL);
 }
 
 /*
  * Release memory held in TCP reassembly queues.
  *
  * Does nothing unless do_tcpdrain is set.  Otherwise, for every vnet, the
  * inpcb list is walked under the tcbinfo read lock; each entry that is not
  * in TIME_WAIT is write-locked and, if it has a tcpcb, its reassembly
  * queue is freed and tcp_clean_sackreport() is called on it.
  */
 void
 tcp_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	if (!do_tcpdrain)
 		return;
 
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		INIT_VNET_INET(vnet_iter);
 		struct inpcb *inpb;
 		struct tcpcb *tcpb;
 		struct tseg_qent *te;
 
 	/*
 	 * Walk the tcpbs, if existing, and flush the reassembly queue,
 	 * if there is one...
 	 * XXX: The "Net/3" implementation doesn't imply that the TCP
 	 *      reassembly queue should be flushed, but in a situation
 	 *	where we're really low on mbufs, this is potentially
 	 *	useful.
 	 */
 		INP_INFO_RLOCK(&V_tcbinfo);
 		LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
 			/* TIME_WAIT entries are skipped without being locked. */
 			if (inpb->inp_vflag & INP_TIMEWAIT)
 				continue;
 			INP_WLOCK(inpb);
 			if ((tcpb = intotcpcb(inpb)) != NULL) {
 				while ((te = LIST_FIRST(&tcpb->t_segq))
 			            != NULL) {
 					LIST_REMOVE(te, tqe_q);
 					m_freem(te->tqe_m);
 					uma_zfree(tcp_reass_zone, te);
 					tcpb->t_segqlen--;
 					V_tcp_reass_qsize--;
 				}
 				tcp_clean_sackreport(tcpb);
 			}
 			INP_WUNLOCK(inpb);
 		}
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 }
 
 /*
  * Notify a tcp user of an asynchronous error;
  * store error as soft error, but wake up user
  * (for now, won't do anything until can select for soft error).
  *
  * Do not wake up user since there currently is no mechanism for
  * reporting soft errors (yet - a kqueue filter may be added).
  */
 static struct inpcb *
 tcp_notify(struct inpcb *inp, int error)
 {
 	struct tcpcb *tp;
 #ifdef INVARIANTS
 	INIT_VNET_INET(inp->inp_vnet); /* V_tcbinfo WLOCK ASSERT */
 #endif
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if ((inp->inp_vflag & INP_TIMEWAIT) ||
 	    (inp->inp_vflag & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
 
 	/*
 	 * Ignore some errors if we are hooked up.
 	 * If connection hasn't completed, has retransmitted several times,
 	 * and receives a second error, give up now.  This is better
 	 * than waiting a long time to establish a connection that
 	 * can never complete.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
 	     error == EHOSTDOWN)) {
 		return (inp);
 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
 	    tp->t_softerror) {
 		tp = tcp_drop(tp, error);
 		if (tp != NULL)
 			return (inp);
 		else
 			return (NULL);
 	} else {
 		tp->t_softerror = error;
 		return (inp);
 	}
 #if 0
 	wakeup( &so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
 #endif
 }
 
 /*
  * Sysctl handler that exports the list of TCP connections.
  *
  * The output is a leading struct xinpgen, the syncache entries produced
  * by syncache_pcblist(), one struct xtcpcb per visible inpcb, and a
  * trailing struct xinpgen carrying the generation count observed at the
  * end.  With a NULL old pointer only a size estimate is returned.  The
  * handler is read-only: supplying a new value fails with EPERM.
  */
 static int
 tcp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	INIT_VNET_INET(curvnet);
 	int error, i, m, n, pcb_count;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == NULL) {
 		m = syncache_pcbcount();
 		n = V_tcbinfo.ipi_count;
 		/* Estimate: two xinpgen records plus n/8 entries of slack. */
 		req->oldidx = 2 * (sizeof xig)
 			+ ((m + n) + n/8) * sizeof(struct xtcpcb);
 		return (0);
 	}
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	/* Snapshot the generation count and the pcb count together. */
 	INP_INFO_RLOCK(&V_tcbinfo);
 	gencnt = V_tcbinfo.ipi_gencnt;
 	n = V_tcbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	m = syncache_pcbcount();
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ (n + m) * sizeof(struct xtcpcb));
 	if (error != 0)
 		return (error);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n + m;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	error = syncache_pcblist(req, m, &pcb_count);
 	if (error)
 		return (error);
 
 	/*
 	 * NOTE(review): the allocation uses M_WAITOK, which presumably never
 	 * returns NULL, making the check below dead code - confirm against
 	 * malloc(9).
 	 */
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == NULL)
 		return (ENOMEM);
 
 	/*
 	 * First pass: under the tcbinfo read lock, collect at most n inpcbs
 	 * that are no newer than the snapshot generation and that the
 	 * requesting credential is allowed to see.
 	 */
 	INP_INFO_RLOCK(&V_tcbinfo);
 	for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
 	    inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
 		INP_RLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			/*
 			 * XXX: This use of cr_cansee(), introduced with
 			 * TCP state changes, is not quite right, but for
 			 * now, better than nothing.
 			 */
 			if (inp->inp_vflag & INP_TIMEWAIT) {
 				if (intotw(inp) != NULL)
 					error = cr_cansee(req->td->td_ucred,
 					    intotw(inp)->tw_cred);
 				else
 					error = EINVAL;	/* Skip this inp. */
 			} else
 				error = cr_canseeinpcb(req->td->td_ucred, inp);
 			if (error == 0)
 				inp_list[i++] = inp;
 		}
 		INP_RUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	n = i;
 
 	/*
 	 * Second pass: copy each collected inpcb out to the caller.
 	 *
 	 * NOTE(review): the pointers in inp_list are dereferenced here after
 	 * the tcbinfo lock has been released, and nothing visible in this
 	 * function takes a reference on them - confirm what keeps them valid.
 	 * NOTE(review): a SYSCTL_OUT failure does not end the loop; a later
 	 * iteration overwrites `error'.
 	 */
 	error = 0;
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			struct xtcpcb xt;
 			void *inp_ppcb;
 
 			bzero(&xt, sizeof(xt));
 			xt.xt_len = sizeof xt;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xt.xt_inp, sizeof *inp);
 			inp_ppcb = inp->inp_ppcb;
 			if (inp_ppcb == NULL)
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 			else if (inp->inp_vflag & INP_TIMEWAIT) {
 				/* No tcpcb is copied; report only the state. */
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 				xt.xt_tp.t_state = TCPS_TIME_WAIT;
 			} else
 				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
 			if (inp->inp_socket != NULL)
 				sotoxsocket(inp->inp_socket, &xt.xt_socket);
 			else {
 				bzero(&xt.xt_socket, sizeof xt.xt_socket);
 				xt.xt_socket.xso_protocol = IPPROTO_TCP;
 			}
 			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
 			/* Drop the inpcb lock before copying out. */
 			INP_RUNLOCK(inp);
 			error = SYSCTL_OUT(req, &xt, sizeof xt);
 		} else
 			INP_RUNLOCK(inp);
 	
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&V_tcbinfo);
 		xig.xig_gen = V_tcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
     tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
 
 static int
 tcp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	INIT_VNET_INET(curvnet);
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr,
 	    addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
 	if (inp != NULL) {
 		INP_RLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		error = ENOENT;
 	}
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
 
 #ifdef INET6
 /*
  * Sysctl handler returning the credential of an IPv6 TCP connection.
  *
  * The caller writes two struct sockaddr_in6 values; element 1 is passed to
  * the pcb lookup as the foreign address/port and element 0 as the local
  * address/port.  If element 0 is an IPv4-mapped address, element 1 must be
  * one too (EINVAL otherwise) and the IPv4 lookup is used on the embedded
  * addresses.  Requires PRIV_NETINET_GETCRED; returns ENOENT when no
  * matching inpcb exists or it has no socket.
  */
 static int
 tcp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	INIT_VNET_INET(curvnet);
 	INIT_VNET_INET6(curvnet);
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct inpcb *inp;
 	int error, mapped = 0;
 
 	if ((error = priv_check(req->td, PRIV_NETINET_GETCRED)) != 0)
 		return (error);
 	if ((error = SYSCTL_IN(req, addrs, sizeof(addrs))) != 0)
 		return (error);
 
 	error = sa6_embedscope(&addrs[0], V_ip6_use_defzone);
 	if (error != 0)
 		return (error);
 	error = sa6_embedscope(&addrs[1], V_ip6_use_defzone);
 	if (error != 0)
 		return (error);
 
 	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
 		/* A mapped address on one side only cannot match anything. */
 		if (!IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
 			return (EINVAL);
 		mapped = 1;
 	}
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	if (mapped != 1)
 		inp = in6_pcblookup_hash(&V_tcbinfo,
 			&addrs[1].sin6_addr, addrs[1].sin6_port,
 			&addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL);
 	else
 		inp = in_pcblookup_hash(&V_tcbinfo,
 			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
 			addrs[1].sin6_port,
 			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
 			addrs[0].sin6_port,
 			0, NULL);
 	if (inp == NULL) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		return (ENOENT);
 	}
 
 	/* Hand over from the pcbinfo lock to the inpcb lock. */
 	INP_RLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	if (inp->inp_socket == NULL)
 		error = ENOENT;
 	else
 		error = cr_canseeinpcb(req->td->td_ucred, inp);
 	if (error == 0)
 		cru2x(inp->inp_cred, &xuc);
 	INP_RUNLOCK(inp);
 
 	if (error != 0)
 		return (error);
 	return (SYSCTL_OUT(req, &xuc, sizeof(struct xucred)));
 }
 
 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
 #endif
 
 
 /*
  * Handle a control (error) indication for IPv4 TCP.
  *
  * cmd selects the per-connection callback: tcp_mtudisc for PRC_MSGSIZE,
  * tcp_drop_syn_sent for selected unreachable/time-exceeded commands when
  * V_icmp_may_rst is set and an IP header was supplied, tcp_notify
  * otherwise.  Redirects, source quench and commands without an entry in
  * inetctlerrmap are ignored.
  *
  * When vip is non-NULL it points at the IP header quoted in the ICMP
  * message; the matching connection is looked up and notified only if the
  * quoted sequence number lies in [snd_una, snd_max).  If no inpcb matches,
  * the syncache is informed instead.  When no IP header is available every
  * connection to the foreign address is notified through in_pcbnotifyall().
  */
 void
 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	INIT_VNET_INET(curvnet);
 	struct ip *ip = vip;
 	struct tcphdr *th;
 	struct in_addr faddr;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct icmp *icp;
 	struct in_conninfo inc;
 	tcp_seq icmp_tcp_seq;
 	int mtu;
 
 	/* NOTE(review): sa is read as a sockaddr_in before sa_family is checked. */
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc;
 	else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
 		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
 		notify = tcp_drop_syn_sent;
 	/*
 	 * Redirects don't need to be handled up here.
 	 */
 	else if (PRC_IS_REDIRECT(cmd))
 		return;
 	/*
 	 * Source quench is deprecated.
 	 */
 	else if (cmd == PRC_QUENCH)
 		return;
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 * XXX: We never get this from ICMP, otherwise it makes an
 	 * excellent DoS attack on machines with many connections.
 	 */
 	else if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	if (ip != NULL) {
 		/*
 		 * ip points at the icmp_ip member of the ICMP message, so
 		 * step back to the start of the ICMP header; the quoted TCP
 		 * header follows the quoted IP header.
 		 */
 		icp = (struct icmp *)((caddr_t)ip
 				      - offsetof(struct icmp, icmp_ip));
 		th = (struct tcphdr *)((caddr_t)ip
 				       + (ip->ip_hl << 2));
 		INP_INFO_WLOCK(&V_tcbinfo);
 		inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport,
 		    ip->ip_src, th->th_sport, 0, NULL);
 		if (inp != NULL)  {
 			INP_WLOCK(inp);
 			if (!(inp->inp_vflag & INP_TIMEWAIT) &&
 			    !(inp->inp_vflag & INP_DROPPED) &&
 			    !(inp->inp_socket == NULL)) {
 				/*
 				 * NOTE(review): this is a network-to-host
 				 * conversion; ntohl() would state the intent.
 				 */
 				icmp_tcp_seq = htonl(th->th_seq);
 				tp = intotcpcb(inp);
 				/* Only act if the quoted segment is still outstanding. */
 				if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
 				    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
 					if (cmd == PRC_MSGSIZE) {
 					    /*
 					     * MTU discovery:
 					     * If we got a needfrag set the MTU
 					     * in the route to the suggested new
 					     * value (if given) and then notify.
 					     */
 					    bzero(&inc, sizeof(inc));
-					    inc.inc_flags = 0;	/* IPv4 */
 					    inc.inc_faddr = faddr;
 					    inc.inc_fibnum =
 						inp->inp_inc.inc_fibnum;
 
 					    mtu = ntohs(icp->icmp_nextmtu);
 					    /*
 					     * If no alternative MTU was
 					     * proposed, try the next smaller
 					     * one.  ip->ip_len has already
 					     * been swapped in icmp_input().
 					     */
 					    if (!mtu)
 						mtu = ip_next_mtu(ip->ip_len,
 						 1);
 					    /*
 					     * Reject values below the floor;
 					     * the default is substituted below.
 					     */
 					    if (mtu < max(296, V_tcp_minmss
 						 + sizeof(struct tcpiphdr)))
 						mtu = 0;
 					    if (!mtu)
 						mtu = V_tcp_mssdflt
 						 + sizeof(struct tcpiphdr);
 					    /*
 					     * Only cache the MTU if it
 					     * is smaller than the interface
 					     * or route MTU.  tcp_mtudisc()
 					     * will do right thing by itself.
 					     */
 					    if (mtu <= tcp_maxmtu(&inc, NULL))
 						tcp_hc_updatemtu(&inc, mtu);
 					}
 
 					/* The callback returns NULL if the inpcb is gone. */
 					inp = (*notify)(inp, inetctlerrmap[cmd]);
 				}
 			}
 			if (inp != NULL)
 				INP_WUNLOCK(inp);
 		} else {
 			/* No inpcb: let the syncache look for a matching entry. */
+			bzero(&inc, sizeof(inc));
 			inc.inc_fport = th->th_dport;
 			inc.inc_lport = th->th_sport;
 			inc.inc_faddr = faddr;
 			inc.inc_laddr = ip->ip_src;
-#ifdef INET6
-			inc.inc_isipv6 = 0;
-#endif
 			syncache_unreach(&inc, th);
 		}
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	} else
 		in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
 }
 
 #ifdef INET6
 /*
  * Handle a control (error) indication for IPv6 TCP.
  *
  * PRC_MSGSIZE selects tcp_mtudisc as the per-connection callback, all
  * other accepted commands use tcp_notify.  Source quench and commands
  * that are neither redirects nor present in inet6ctlerrmap are ignored.
  *
  * When d is non-NULL it is a struct ip6ctlparam describing the offending
  * packet; the TCP ports are copied out of the mbuf, the matching pcbs are
  * notified through in6_pcbnotify() and the syncache is informed as well.
  * Without it, in6_pcbnotify() is called with zero ports.
  */
 void
 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 	INIT_VNET_INET(curvnet);
 	struct tcphdr th;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
 	struct ip6ctlparam *ip6cp = NULL;
 	const struct sockaddr_in6 *sa6_src = NULL;
 	int off;
 	/* Only the two port fields are read from the packet. */
 	struct tcp_portonly {
 		u_int16_t th_sport;
 		u_int16_t th_dport;
 	} *thp;
 
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc;
 	else if (!PRC_IS_REDIRECT(cmd) &&
 		 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
 		return;
 	/* Source quench is deprecated. */
 	else if (cmd == PRC_QUENCH)
 		return;
 
 	/* if the parameter is from icmp6, decode it. */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
 		m = ip6cp->ip6c_m;
 		ip6 = ip6cp->ip6c_ip6;
 		off = ip6cp->ip6c_off;
 		sa6_src = ip6cp->ip6c_src;
 	} else {
 		m = NULL;
 		ip6 = NULL;
 		off = 0;	/* fool gcc */
 		sa6_src = &sa6_any;
 	}
 
 	if (ip6 != NULL) {
 		struct in_conninfo inc;
 		/*
 		 * XXX: We assume that when IPV6 is non NULL,
 		 * M and OFF are valid.
 		 */
 
 		/* check if we can safely examine src and dst ports */
 		if (m->m_pkthdr.len < off + sizeof(*thp))
 			return;
 
 		/* Zero the header, then fill in just the ports from the mbuf. */
 		bzero(&th, sizeof(th));
 		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
 
 		in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
 		    (struct sockaddr *)ip6cp->ip6c_src,
 		    th.th_sport, cmd, NULL, notify);
 
+		bzero(&inc, sizeof(inc));
 		inc.inc_fport = th.th_dport;
 		inc.inc_lport = th.th_sport;
 		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
 		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
-		inc.inc_isipv6 = 1;
+		inc.inc_flags |= INC_ISIPV6;
 		/* syncache_unreach() is called with the tcbinfo write lock held. */
 		INP_INFO_WLOCK(&V_tcbinfo);
 		syncache_unreach(&inc, &th);
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	} else
 		in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
 			      0, cmd, NULL, notify);
 }
 #endif /* INET6 */
 
 
 /*
  * Following is where TCP initial sequence number generation occurs.
  *
  * There are two places where we must use initial sequence numbers:
  * 1.  In SYN-ACK packets.
  * 2.  In SYN packets.
  *
  * All ISNs for SYN-ACK packets are generated by the syncache.  See
  * tcp_syncache.c for details.
  *
  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
  * depends on this property.  In addition, these ISNs should be
  * unguessable so as to prevent connection hijacking.  To satisfy
  * the requirements of this situation, the algorithm outlined in
  * RFC 1948 is used, with only small modifications.
  *
  * Implementation details:
  *
  * Time is based off the system timer, and is corrected so that it
  * increases by one megabyte per second.  This allows for proper
  * recycling on high speed LANs while still leaving over an hour
  * before rollover.
  *
  * As reading the *exact* system time is too expensive to be done
  * whenever setting up a TCP connection, we increment the time
  * offset in two ways.  First, a small random positive increment
  * is added to isn_offset for each connection that is set up.
  * Second, the function tcp_isn_tick fires once per clock tick
  * and increments isn_offset as necessary so that sequence numbers
  * are incremented at approximately ISN_BYTES_PER_SECOND.  The
  * random positive increments serve only to ensure that the same
  * exact sequence number is never sent out twice (as could otherwise
  * happen when a port is recycled in less than the system tick
  * interval.)
  *
  * net.inet.tcp.isn_reseed_interval controls the number of seconds
  * between seeding of isn_secret.  This is normally set to zero,
  * as reseeding should not be necessary.
  *
  * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
  * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock.  In
  * general, this means holding an exclusive (write) lock.
  */
 
 /* Target rate at which the ISN offset advances (used by tcp_isn_tick()). */
 #define ISN_BYTES_PER_SECOND 1048576
 /* Fixed amount added to isn_offset for every ISN handed out. */
 #define ISN_STATIC_INCREMENT 4096
 /* Bit mask applied to arc4random() for the random part of the increment. */
 #define ISN_RANDOM_INCREMENT (4096 - 1)
 
 #ifdef VIMAGE_GLOBALS
 /* ISN generator state; accessed through the V_ prefixed names below. */
 static u_char isn_secret[32];
 static int isn_last_reseed;
 static u_int32_t isn_offset, isn_offset_old;
 #endif
 
 /*
  * Generate the initial sequence number for a connection.
  *
  * The result is the first 32-bit word of an MD5 hash over the
  * connection's foreign and local ports, foreign and local addresses and
  * the secret, plus the running offset V_isn_offset.  The offset is
  * advanced by ISN_STATIC_INCREMENT plus a random amount on every call.
  * The inpcb write lock must be held; generator state is protected by
  * ISN_LOCK().
  */
 tcp_seq
 tcp_new_isn(struct tcpcb *tp)
 {
 	INIT_VNET_INET(tp->t_vnet);
 	MD5_CTX isn_ctx;
 	u_int32_t md5_buffer[4];
 	tcp_seq new_isn;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	ISN_LOCK();
 	/* Seed if this is the first use, reseed if requested. */
 	if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
 	     (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
 		< (u_int)ticks))) {
 		read_random(&V_isn_secret, sizeof(V_isn_secret));
 		V_isn_last_reseed = ticks;
 	}
 
 	/* Compute the md5 hash and return the ISN. */
 	MD5Init(&isn_ctx);
 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
 #ifdef INET6
 	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
 			  sizeof(struct in6_addr));
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
 			  sizeof(struct in6_addr));
 	} else
 #endif
 	{
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
 			  sizeof(struct in_addr));
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
 			  sizeof(struct in_addr));
 	}
 	MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret));
 	MD5Final((u_char *) &md5_buffer, &isn_ctx);
 	/* Only the first word of the digest is used. */
 	new_isn = (tcp_seq) md5_buffer[0];
 	/* Advance the shared offset so consecutive ISNs never repeat. */
 	V_isn_offset += ISN_STATIC_INCREMENT +
 		(arc4random() & ISN_RANDOM_INCREMENT);
 	new_isn += V_isn_offset;
 	ISN_UNLOCK();
 	return (new_isn);
 }
 
 /*
  * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary
  * to keep time flowing at a relatively constant rate.  If the random
  * increments have already pushed us past the projected offset, do nothing.
  *
  * Runs as a callout handler: it processes every vnet and then re-arms
  * isn_callout to fire again after hz/100 ticks.  The argument is unused.
  */
 static void
 tcp_isn_tick(void *xtp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	u_int32_t projected_offset;
 
 	ISN_LOCK();
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */
 		INIT_VNET_INET(curvnet);
 		/* Where the offset should be if it only advanced at the nominal rate. */
 		projected_offset =
 		    V_isn_offset_old + ISN_BYTES_PER_SECOND / 100;
 
 		if (SEQ_GT(projected_offset, V_isn_offset))
 			V_isn_offset = projected_offset;
 
 		/* Remember this tick's value as the base for the next projection. */
 		V_isn_offset_old = V_isn_offset;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 	callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
 	ISN_UNLOCK();
 }
 
 /*
  * Notification callback used when a specific ICMP unreachable message is
  * received: if the connection is still in SYN-SENT it is dropped with the
  * given error.  This behavior is controlled by the icmp_may_rst sysctl.
  *
  * TIME_WAIT and already dropped inpcbs, and connections in any other
  * state, are left untouched.  Returns inp, or NULL when tcp_drop()
  * returned NULL.  The tcbinfo and inpcb write locks must be held.
  */
 struct inpcb *
 tcp_drop_syn_sent(struct inpcb *inp, int errno)
 {
 #ifdef INVARIANTS
 	INIT_VNET_INET(inp->inp_vnet);
 #endif
 	struct tcpcb *tp;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	if (tp->t_state != TCPS_SYN_SENT)
 		return (inp);
 
 	return (tcp_drop(tp, errno) != NULL ? inp : NULL);
 }
 
 /*
  * When `need fragmentation' ICMP is received, update our idea of the MSS
  * based on the new value in the route.  Also nudge TCP to send something,
  * since we know the packet we just sent was dropped.
  * This duplicates some code in the tcp_mss() function in tcp_input.c.
  *
  * TIME_WAIT and dropped inpcbs are ignored.  The errno argument is not
  * used.  Always returns inp; the inpcb write lock must be held.
  */
 struct inpcb *
 tcp_mtudisc(struct inpcb *inp, int errno)
 {
 	INIT_VNET_INET(inp->inp_vnet);
 	struct tcpcb *tp;
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(inp);
 	if ((inp->inp_vflag & INP_TIMEWAIT) ||
 	    (inp->inp_vflag & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
 
 	/* Recompute the MSS; -1 means no MSS offer is supplied by the caller. */
 	tcp_mss_update(tp, -1, NULL, NULL);
   
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_snd);
 	/* If the mss is larger than the socket buffer, decrease the mss. */
 	if (so->so_snd.sb_hiwat < tp->t_maxseg)
 		tp->t_maxseg = so->so_snd.sb_hiwat;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	V_tcpstat.tcps_mturesent++;
 	/*
 	 * Rewind the send state to the first unacknowledged byte so that
 	 * the output call below resends from there with the new MSS.
 	 */
 	tp->t_rtttime = 0;
 	tp->snd_nxt = tp->snd_una;
 	tcp_free_sackholes(tp);
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		EXIT_FASTRECOVERY(tp);
 	tcp_output_send(tp);
 	return (inp);
 }
 
 /*
  * Look-up the routing entry to the peer of this inpcb.  If no route
  * is found and it cannot be allocated, then return 0.  This routine
  * is called by TCP routines that access the rmx structure and by
  * tcp_mss_update to get the peer/interface MTU.
  */
 u_long
 tcp_maxmtu(struct in_conninfo *inc, int *flags)
 {
 	struct route sro;
 	struct sockaddr_in *dst;
 	struct ifnet *ifp;
 	u_long maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
 
 	bzero(&sro, sizeof(sro));
 	if (inc->inc_faddr.s_addr != INADDR_ANY) {
 	        dst = (struct sockaddr_in *)&sro.ro_dst;
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = inc->inc_faddr;
 		in_rtalloc_ign(&sro, 0, inc->inc_fibnum);
 	}
 	if (sro.ro_rt != NULL) {
 		ifp = sro.ro_rt->rt_ifp;
 		if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
 			maxmtu = ifp->if_mtu;
 		else
 			maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
 
 		/* Report additional interface capabilities. */
 		if (flags != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO4 &&
 			    ifp->if_hwassist & CSUM_TSO)
 				*flags |= CSUM_TSO;
 		}
 		RTFREE(sro.ro_rt);
 	}
 	return (maxmtu);
 }
 
 #ifdef INET6
 /*
  * IPv6 counterpart of tcp_maxmtu(): look up the route to the peer in
  * `inc' and return the link MTU of the outgoing interface, limited by the
  * route MTU when the route carries one.  Returns 0 when the foreign
  * address is unspecified or no route is found.
  *
  * If `flags' is non-NULL, CSUM_TSO is or'ed into it when the outgoing
  * interface has IPv6 TSO enabled.
  */
 u_long
 tcp_maxmtu6(struct in_conninfo *inc, int *flags)
 {
 	struct route_in6 ro6;
 	struct ifnet *ifp;
 	u_long mtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
 
 	bzero(&ro6, sizeof(ro6));
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
 		ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
 		ro6.ro_dst.sin6_family = AF_INET6;
 		ro6.ro_dst.sin6_addr = inc->inc6_faddr;
 		rtalloc_ign((struct route *)&ro6, 0);
 	}
 	if (ro6.ro_rt == NULL)
 		return (mtu);
 
 	ifp = ro6.ro_rt->rt_ifp;
 	if (ro6.ro_rt->rt_rmx.rmx_mtu != 0)
 		mtu = min(ro6.ro_rt->rt_rmx.rmx_mtu,
 			  IN6_LINKMTU(ro6.ro_rt->rt_ifp));
 	else
 		mtu = IN6_LINKMTU(ro6.ro_rt->rt_ifp);
 
 	/* Report additional interface capabilities. */
 	if (flags != NULL && (ifp->if_capenable & IFCAP_TSO6) &&
 	    (ifp->if_hwassist & CSUM_TSO))
 		*flags |= CSUM_TSO;
 
 	RTFREE(ro6.ro_rt);
 	return (mtu);
 }
 #endif /* INET6 */
 
 #ifdef IPSEC
 /*
  * compute ESP/AH header size for TCP, including outer IP header.
  *
  * A temporary mbuf is filled with template IP/IPv6 and TCP headers for
  * the connection and handed to ipsec4_hdrsiz()/ipsec6_hdrsiz() for the
  * outbound direction.  Returns 0 when tp or its inpcb is NULL or when the
  * mbuf cannot be allocated.
  */
 size_t
 ipsec_hdrsiz_tcp(struct tcpcb *tp)
 {
 	struct inpcb *inp;
 	struct mbuf *m;
 	size_t hdrsiz;
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct tcphdr *th;
 
 	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
 		return (0);
 	/* Non-sleeping allocation; failure is reported as size 0. */
 	MGETHDR(m, M_DONTWAIT, MT_DATA);
 	if (!m)
 		return (0);
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		m->m_pkthdr.len = m->m_len =
 			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		tcpip_fillheaders(inp, ip6, th);
 		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)(ip + 1);
 		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
 		tcpip_fillheaders(inp, ip, th);
 		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
 	}
 
 	/* The mbuf only served as a header template. */
 	m_free(m);
 	return (hdrsiz);
 }
 #endif /* IPSEC */
 
 /*
  * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
  *
  * This code attempts to calculate the bandwidth-delay product as a
  * means of determining the optimal window size to maximize bandwidth,
  * minimize RTT, and avoid the over-allocation of buffers on interfaces and
  * routers.  This code also does a fairly good job keeping RTTs in check
  * across slow links like modems.  We implement an algorithm which is very
  * similar to (but not meant to be) TCP/Vegas.  The code operates on the
  * transmitter side of a TCP connection and so only affects the transmit
  * side of the connection.
  *
  * BACKGROUND:  TCP makes no provision for the management of buffer space
  * at the end points or at the intermediate routers and switches.  A TCP
  * stream, whether using NewReno or not, will eventually buffer as
  * many packets as it is able and the only reason this typically works is
  * due to the fairly small default buffers made available for a connection
  * (typically 16K or 32K).  As machines use larger windows and/or window
  * scaling it is now fairly easy for even a single TCP connection to blow-out
  * all available buffer space not only on the local interface, but on
  * intermediate routers and switches as well.  NewReno makes a misguided
  * attempt to 'solve' this problem by waiting for an actual failure to occur,
  * then backing off, then steadily increasing the window again until another
  * failure occurs, ad-infinitum.  This results in terrible oscillation that
  * is only made worse as network loads increase and the idea of intentionally
  * blowing out network buffers is, frankly, a terrible way to manage network
  * resources.
  *
  * It is far better to limit the transmit window prior to the failure
  * condition being achieved.  There are two general ways to do this:  First
  * you can 'scan' through different transmit window sizes and locate the
  * point where the RTT stops increasing, indicating that you have filled the
  * pipe, then scan backwards until you note that RTT stops decreasing, then
  * repeat ad-infinitum.  This method works in principle but has severe
  * implementation issues due to RTT variances, timer granularity, and
  * instability in the algorithm which can lead to many false positives and
  * create oscillations as well as interact badly with other TCP streams
  * implementing the same algorithm.
  *
  * The second method is to limit the window to the bandwidth delay product
  * of the link.  This is the method we implement.  RTT variances and our
  * own manipulation of the congestion window, bwnd, can potentially
  * destabilize the algorithm.  For this reason we have to stabilize the
  * elements used to calculate the window.  We do this by using the minimum
  * observed RTT, the long term average of the observed bandwidth, and
  * by adding two segments worth of slop.  It isn't perfect but it is able
  * to react to changing conditions and gives us a very stable basis on
  * which to extend the algorithm.
  */
 void
 tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
 {
 	INIT_VNET_INET(tp->t_vnet);
 	u_long bw;
 	u_long bwnd;
 	int save_ticks;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * If inflight_enable is disabled in the middle of a tcp connection,
 	 * make sure snd_bwnd is effectively disabled.
 	 */
 	if (V_tcp_inflight_enable == 0 ||
 	    tp->t_rttlow < V_tcp_inflight_rttthresh) {
 		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 		tp->snd_bandwidth = 0;
 		return;
 	}
 
 	/*
 	 * Figure out the bandwidth.  Due to the tick granularity this
 	 * is a very rough number and it MUST be averaged over a fairly
 	 * long period of time.  XXX we need to take into account a link
 	 * that is not using all available bandwidth, but for now our
 	 * slop will ramp us up if this case occurs and the bandwidth later
 	 * increases.
 	 *
 	 * Note: if ticks rollover 'bw' may wind up negative.  We must
 	 * effectively reset t_bw_rtttime for this case.
 	 */
 	save_ticks = ticks;
 	/* No tick has elapsed since the last sample; also avoids dividing by 0. */
 	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
 		return;
 
 	/* Sequence space acked since the last sample, scaled by hz per tick. */
 	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
 	    (save_ticks - tp->t_bw_rtttime);
 	tp->t_bw_rtttime = save_ticks;
 	tp->t_bw_rtseq = ack_seq;
 	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
 		return;
 	/* Running average: 15/16 of the old estimate plus 1/16 of the sample. */
 	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
 
 	tp->snd_bandwidth = bw;
 
 	/*
 	 * Calculate the semi-static bandwidth delay product, plus two maximal
 	 * segments.  The additional slop puts us squarely in the sweet
 	 * spot and also handles the bandwidth run-up case and stabilization.
 	 * Without the slop we could be locking ourselves into a lower
 	 * bandwidth.
 	 *
 	 * Situations Handled:
 	 *	(1) Prevents over-queueing of packets on LANs, especially on
 	 *	    high speed LANs, allowing larger TCP buffers to be
 	 *	    specified, and also does a good job preventing
 	 *	    over-queueing of packets over choke points like modems
 	 *	    (at least for the transmit side).
 	 *
 	 *	(2) Is able to handle changing network loads (bandwidth
 	 *	    drops so bwnd drops, bandwidth increases so bwnd
 	 *	    increases).
 	 *
 	 *	(3) Theoretically should stabilize in the face of multiple
 	 *	    connections implementing the same algorithm (this may need
 	 *	    a little work).
 	 *
 	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
 	 *	    be adjusted with a sysctl but typically only needs to be
 	 *	    on very slow connections.  A value no smaller than 5
 	 *	    should be used, but only reduce this default if you have
 	 *	    no other choice.
 	 */
 	/* RTT used for the product: mean of the smoothed and the best RTT. */
 #define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
 	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10;
 #undef USERTT
 
 	if (tcp_inflight_debug > 0) {
 		/* Rate-limits the debug output to tcp_inflight_debug lines per second. */
 		static int ltime;
 		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
 			ltime = ticks;
 			/* NOTE(review): bw and bwnd are u_long but are printed with %ld. */
 			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
 			    tp,
 			    bw,
 			    tp->t_rttbest,
 			    tp->t_srtt,
 			    bwnd
 			);
 		}
 	}
 	/* Clamp to the configured bounds, then to at least two segments. */
 	if ((long)bwnd < V_tcp_inflight_min)
 		bwnd = V_tcp_inflight_min;
 	if (bwnd > V_tcp_inflight_max)
 		bwnd = V_tcp_inflight_max;
 	if ((long)bwnd < tp->t_maxseg * 2)
 		bwnd = tp->t_maxseg * 2;
 	tp->snd_bwnd = bwnd;
 }
 
 #ifdef TCP_SIGNATURE
 /*
  * m_apply() callback used by tcp_signature_compute(): feeds one run of
  * TCP segment bytes from the mbuf chain into the MD5 context passed in
  * 'fstate'.  Always returns 0.
  */
 static int
 tcp_signature_apply(void *fstate, void *data, u_int len)
 {
 	u_char *bytes = data;
 
 	MD5Update(fstate, bytes, len);
 	return (0);
 }
 
 /*
  * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
  *
  * Parameters:
  * m		pointer to head of mbuf chain
  * _unused	ignored
  * len		length of TCP segment data, excluding options
  * optlen	length of TCP segment options
  * buf		pointer to storage for computed MD5 digest
  * direction	direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
  *
  * We do this over ip, tcphdr, segment data, and the key in the SADB.
  * When called from tcp_input(), we can be sure that th_sum has been
  * zeroed out and verified already.
  *
  * Returns 0 if successful.  Returns EINVAL if the IP version found in
  * the packet is not supported or if no matching SADB entry exists.
  *
  * XXX The key is retrieved from the system's PF_KEY SADB, by keying a
  * search with the destination IP address, and a 'magic SPI' to be
  * determined by the application. This is hardcoded elsewhere to 1179
  * right now. Another branch of this code exists which uses the SPD to
  * specify per-application flows but it is unstable.
  */
 int
 tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
     u_char *buf, u_int direction)
 {
 	INIT_VNET_IPSEC(curvnet);
 	union sockaddr_union dst;
 	struct ippseudo ippseudo;
 	MD5_CTX ctx;
 	int doff;
 	struct ip *ip;
 	struct ipovly *ipovly;
 	struct secasvar *sav;
 	struct tcphdr *th;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	struct in6_addr in6;
 	char ip6buf[INET6_ADDRSTRLEN];
 	uint32_t plen;
 	/*
 	 * Byte array rather than a 16-bit integer: hashing the first byte
 	 * of a uint16_t holding IPPROTO_TCP yields 0 on big-endian hosts.
 	 */
 	uint8_t nhdr[4];
 #endif
 	u_short savecsum;
 
 	KASSERT(m != NULL, ("NULL mbuf chain"));
 	KASSERT(buf != NULL, ("NULL signature pointer"));
 
 	/*
 	 * Extract the peer address from the IP header in the mbuf: the
 	 * source for inbound segments, the destination otherwise.
 	 */
 	bzero(&dst, sizeof(union sockaddr_union));
 	ip = mtod(m, struct ip *);
 #ifdef INET6
 	ip6 = NULL;	/* Make the compiler happy. */
 #endif
 	switch (ip->ip_v) {
 	case IPVERSION:
 		dst.sa.sa_len = sizeof(struct sockaddr_in);
 		dst.sa.sa_family = AF_INET;
 		dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
 		    ip->ip_src : ip->ip_dst;
 		break;
 #ifdef INET6
 	case (IPV6_VERSION >> 4):
 		ip6 = mtod(m, struct ip6_hdr *);
 		dst.sa.sa_len = sizeof(struct sockaddr_in6);
 		dst.sa.sa_family = AF_INET6;
 		dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ?
 		    ip6->ip6_src : ip6->ip6_dst;
 		break;
 #endif
 	default:
 		return (EINVAL);
 	}
 
 	/* Look up an SADB entry which matches the address of the peer. */
 	sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
 	if (sav == NULL) {
 		ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__,
 		    (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) :
 #ifdef INET6
 			(ip->ip_v == (IPV6_VERSION >> 4)) ?
 			    ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) :
 #endif
 			"(unsupported)"));
 		return (EINVAL);
 	}
 
 	MD5Init(&ctx);
 	/*
 	 * Step 1: Update MD5 hash with IP(v6) pseudo-header.
 	 *
 	 * XXX The ippseudo header MUST be digested in network byte order,
 	 * or else we'll fail the regression test. Assume all fields we've
 	 * been doing arithmetic on have been in host byte order.
 	 * XXX One cannot depend on ipovly->ih_len here. When called from
 	 * tcp_output(), the underlying ip_len member has not yet been set.
 	 */
 	switch (ip->ip_v) {
 	case IPVERSION:
 		ipovly = (struct ipovly *)ip;
 		ippseudo.ippseudo_src = ipovly->ih_src;
 		ippseudo.ippseudo_dst = ipovly->ih_dst;
 		ippseudo.ippseudo_pad = 0;
 		ippseudo.ippseudo_p = IPPROTO_TCP;
 		ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) +
 		    optlen);
 		MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));
 
 		th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip));
 		doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen;
 		break;
 #ifdef INET6
 	/*
 	 * RFC 2385, 2.0  Proposal
 	 * For IPv6, the pseudo-header is as described in RFC 2460, namely the
 	 * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero-
 	 * extended next header value (to form 32 bits), and 32-bit segment
 	 * length.
 	 * Note: Upper-Layer Packet Length comes before Next Header.
 	 */
 	case (IPV6_VERSION >> 4):
 		in6 = ip6->ip6_src;
 		in6_clearscope(&in6);
 		MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
 		in6 = ip6->ip6_dst;
 		in6_clearscope(&in6);
 		MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
 		plen = htonl(len + sizeof(struct tcphdr) + optlen);
 		MD5Update(&ctx, (char *)&plen, sizeof(uint32_t));
 		/* Three zero bytes followed by the next header value. */
 		nhdr[0] = nhdr[1] = nhdr[2] = 0;
 		nhdr[3] = IPPROTO_TCP;
 		MD5Update(&ctx, (char *)nhdr, sizeof(nhdr));
 
 		th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr));
 		doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen;
 		break;
 #endif
 	default:
 		/*
 		 * The first switch already rejected every other version;
 		 * still, never leave with the SA reference held.
 		 */
 		KEY_FREESAV(&sav);
 		return (EINVAL);
 	}
 
 
 	/*
 	 * Step 2: Update MD5 hash with TCP header, excluding options.
 	 * The TCP checksum must be set to zero; the caller's value is put
 	 * back once the header has been digested.
 	 */
 	savecsum = th->th_sum;
 	th->th_sum = 0;
 	MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
 	th->th_sum = savecsum;
 
 	/*
 	 * Step 3: Update MD5 hash with TCP segment data.
 	 *         Use m_apply() to avoid an early m_pullup().
 	 */
 	if (len > 0)
 		m_apply(m, doff, len, tcp_signature_apply, &ctx);
 
 	/*
 	 * Step 4: Update MD5 hash with shared secret.
 	 */
 	MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth));
 	MD5Final(buf, &ctx);
 
 	key_sa_recordxfer(sav, m);
 	KEY_FREESAV(&sav);
 	return (0);
 }
 #endif /* TCP_SIGNATURE */
 
 /*
  * Sysctl handler that drops a single TCP connection.
  *
  * The caller writes two struct sockaddr_storage values: the foreign
  * address followed by the local one.  Both must be AF_INET or AF_INET6
  * sockaddrs of the proper length; a V4-mapped IPv6 pair is converted and
  * looked up as IPv4.
  *
  * Returns EINVAL if an old value is requested or the addresses are
  * malformed, EPERM if no new value is supplied, ENOMEM if the new value
  * is too short, ESRCH if no matching PCB exists, any error from
  * SYSCTL_IN() or sa6_embedscope(), and 0 otherwise.
  */
 static int
 sysctl_drop(SYSCTL_HANDLER_ARGS)
 {
 	INIT_VNET_INET(curvnet);
 #ifdef INET6
 	INIT_VNET_INET6(curvnet);
 #endif
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct tcptw *tw;
 	struct sockaddr_in *fin, *lin;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 	fin = lin = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	/* Validate the address pair before taking any lock. */
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			/*
 			 * Rewrite both entries in place as sockaddr_in;
 			 * the lookup below then switches on the updated
 			 * ss_family.
 			 */
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 			break;
 		}
 		error = sa6_embedscope(fin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 	default:
 		return (EINVAL);
 	}
 	INP_INFO_WLOCK(&V_tcbinfo);
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		/*
 		 * Look up with the addresses processed by sa6_embedscope()
 		 * above.  Separate, never-initialized in6_addr locals were
 		 * passed here before, so the lookup used stack garbage.
 		 */
 		inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    0, NULL);
 		break;
 #endif
 	case AF_INET:
 		inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr,
 		    fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL);
 		break;
 	}
 	if (inp != NULL) {
 		INP_WLOCK(inp);
 		if (inp->inp_vflag & INP_TIMEWAIT) {
 			/*
 			 * XXXRW: There currently exists a state where an
 			 * inpcb is present, but its timewait state has been
 			 * discarded.  For now, don't allow dropping of this
 			 * type of inpcb.
 			 *
 			 * NOTE(review): no unlock follows tcp_twclose(), so
 			 * it presumably releases the inpcb lock itself --
 			 * confirm against its definition.
 			 */
 			tw = intotw(inp);
 			if (tw != NULL)
 				tcp_twclose(tw, 0);
 			else
 				INP_WUNLOCK(inp);
 		} else if (!(inp->inp_vflag & INP_DROPPED) &&
 			   !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
 			/*
 			 * Neither already dropped nor a listening socket.
 			 * The inpcb is unlocked here only when tcp_drop()
 			 * hands back a non-NULL tcpcb.
 			 */
 			tp = intotcpcb(inp);
 			tp = tcp_drop(tp, ECONNABORTED);
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 		} else
 			INP_WUNLOCK(inp);
 	} else
 		error = ESRCH;
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 
 /*
  * Register sysctl_drop() as the handler of the "drop" node (OID
  * TCPCTL_DROP) under _net_inet_tcp.  The node is flagged CTLFLAG_WR only;
  * sysctl_drop() itself rejects any request that asks for an old value.
  */
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
     CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
     0, sysctl_drop, "", "Drop TCP connection");
 
 /*
  * Generate a standardized TCP log line for use throughout the
  * tcp subsystem.  Memory allocation is done with M_NOWAIT to
  * allow use in the interrupt context.
  *
  * The addresses and ports are taken from 'inc' when it is non-NULL;
  * otherwise from the IPv6 header (INET6 kernels only) or the IPv4
  * header together with 'th'.  When 'th' is non-NULL the TCP flags are
  * appended as well.
  *
  * NB: The caller MUST free(s, M_TCPLOG) the returned string.
  * NB: The function returns NULL if logging is disabled (both
  *     tcp_log_debug and tcp_log_in_vain are zero), if memory allocation
  *     failed, or if no usable address source was supplied.
  *
  * Due to header inclusion and ordering limitations the struct ip
  * and ip6_hdr pointers have to be passed as void pointers.
  */
 char *
 tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 	char *s, *sp;
 	size_t size;
 	struct ip *ip;
 #ifdef INET6
 	const struct ip6_hdr *ip6;
 
 	ip6 = (const struct ip6_hdr *)ip6hdr;
 #endif /* INET6 */
 	ip = (struct ip *)ip4hdr;
 
 	/*
 	 * The log line looks like this:
 	 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
 	 *
 	 * The buffer is sized for the fixed text, the flag-name template
 	 * and two textual addresses of the largest compiled-in family.
 	 */
 	size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
 	    sizeof(PRINT_TH_FLAGS) + 1 +
 #ifdef INET6
 	    2 * INET6_ADDRSTRLEN;
 #else
 	    2 * INET_ADDRSTRLEN;
 #endif /* INET6 */
 
 	/* Is logging enabled? */
 	if (tcp_log_debug == 0 && tcp_log_in_vain == 0)
 		return (NULL);
 
 	/* M_ZERO: the buffer starts out as an empty, fully zeroed string. */
 	s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
 	if (s == NULL)
 		return (NULL);
 
 	strcat(s, "TCP: [");
 	sp = s + strlen(s);
 
 	/* 'sp' is re-pointed at the current end of the string after each append. */
-	if (inc && inc->inc_isipv6 == 0) {
+	if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) {
 		/* IPv4 endpoints from the connection info. */
 		inet_ntoa_r(inc->inc_faddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		inet_ntoa_r(inc->inc_laddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 #ifdef INET6
 	} else if (inc) {
 		/* IPv6 endpoints from the connection info. */
 		ip6_sprintf(sp, &inc->inc6_faddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &inc->inc6_laddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 	} else if (ip6 && th) {
 		/* No connection info: use the IPv6 and TCP headers. */
 		ip6_sprintf(sp, &ip6->ip6_src);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &ip6->ip6_dst);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET6 */
 	} else if (ip && th) {
 		/* No connection info: use the IPv4 and TCP headers. */
 		inet_ntoa_r(ip->ip_src, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		inet_ntoa_r(ip->ip_dst, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 	} else {
 		/* Nothing to print addresses from. */
 		free(s, M_TCPLOG);
 		return (NULL);
 	}
 	sp = s + strlen(s);
 	if (th)
 		sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS);
 	/*
 	 * After-the-fact overflow check: the buffer was zeroed, so a
 	 * non-NUL last byte means the appends above reached the end of it.
 	 */
 	if (*(s + size - 1) != '\0')
 		panic("%s: string too long", __func__);
 	return (s);
 }
Index: head/sys/netinet/tcp_syncache.c
===================================================================
--- head/sys/netinet/tcp_syncache.c	(revision 186221)
+++ head/sys/netinet/tcp_syncache.c	(revision 186222)
@@ -1,1769 +1,1770 @@
 /*-
  * Copyright (c) 2001 McAfee, Inc.
  * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jonathan Lemon
  * and McAfee Research, the Security Research Division of McAfee, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/md5.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/random.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 #include <sys/vimage.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_offload.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/vinet.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 #include <netipsec/key.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Syncache state and its control knobs as ordinary globals.  They are
  * only defined here when VIMAGE_GLOBALS is set; the rest of the file
  * reaches them through the V_-prefixed names.  syncache_init() assigns
  * their initial values.
  */
 #ifdef VIMAGE_GLOBALS
 static struct tcp_syncache tcp_syncache;
 static int tcp_syncookies;
 static int tcp_syncookiesonly;
 int tcp_sc_rst_sock_fail;
 #endif
 
 /* Read-write knob for tcp_syncookies, named "syncookies" under _net_inet_tcp. */
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, syncookies,
     CTLFLAG_RW, tcp_syncookies, 0,
     "Use TCP SYN cookies if the syncache overflows");
 
 /* Read-write knob for tcp_syncookiesonly, named "syncookies_only". */
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, syncookies_only,
     CTLFLAG_RW, tcp_syncookiesonly, 0,
     "Use only TCP SYN cookies");
 
 /*
  * True when the syncache entry carries a non-NULL sc_toepcb pointer;
  * compiles to constant 0 when TCP_OFFLOAD_DISABLE is defined.
  */
 #ifdef TCP_OFFLOAD_DISABLE
 #define TOEPCB_ISSET(sc) (0)
 #else
 #define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL)
 #endif
 
 /*
  * Forward declarations for the syncache helpers defined below.
  * Note that syncache_lookup() is the only one declared without
  * 'static'.
  */
 static void	 syncache_drop(struct syncache *, struct syncache_head *);
 static void	 syncache_free(struct syncache *);
 static void	 syncache_insert(struct syncache *, struct syncache_head *);
 struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
 static int	 syncache_respond(struct syncache *);
 static struct	 socket *syncache_socket(struct syncache *, struct socket *,
 		    struct mbuf *m);
 static void	 syncache_timeout(struct syncache *sc, struct syncache_head *sch,
 		    int docallout);
 static void	 syncache_timer(void *);
 static void	 syncookie_generate(struct syncache_head *, struct syncache *,
 		    u_int32_t *);
 static struct syncache
 		*syncookie_lookup(struct in_conninfo *, struct syncache_head *,
 		    struct syncache *, struct tcpopt *, struct tcphdr *,
 		    struct socket *);
 
 /*
  * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
  * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds,
  * the odds are that the user has given up attempting to connect by then.
  * syncache_init() installs this value as the default rexmt_limit.
  */
 #define SYNCACHE_MAXREXMTS		3
 
 /*
  * Arbitrary default values for the number of hash buckets and the
  * number of entries per bucket.  syncache_init() installs them and
  * then lets the matching loader tunables override them.
  */
 #define TCP_SYNCACHE_HASHSIZE		512
 #define TCP_SYNCACHE_BUCKETLIMIT	30
 
 /* The "syncache" sysctl subtree under _net_inet_tcp. */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");
 
 /*
  * The nodes below expose fields of tcp_syncache.  bucketlimit,
  * cachelimit and hashsize are flagged CTLFLAG_RDTUN and are fetched as
  * tunables in syncache_init(); count is read-only; rexmtlimit and
  * rst_on_sock_fail are read-write.
  */
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
     bucketlimit, CTLFLAG_RDTUN,
     tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
     cachelimit, CTLFLAG_RDTUN,
     tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
     count, CTLFLAG_RD,
     tcp_syncache.cache_count, 0, "Current number of entries in syncache");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
     hashsize, CTLFLAG_RDTUN,
     tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
     rexmtlimit, CTLFLAG_RW,
     tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");
 
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
      rst_on_sock_fail, CTLFLAG_RW,
      tcp_sc_rst_sock_fail, 0, "Send reset on socket allocation failure");
 
 /* Malloc type used for the hash table allocated in syncache_init(). */
 static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
 
 /*
  * Bucket index for an IPv4 connection: the per-instance hash secret
  * mixed with the foreign address and both ports, reduced by 'mask'.
  * Every macro argument is parenthesized so that arbitrary expressions
  * may be passed.
  */
 #define SYNCACHE_HASH(inc, mask)					\
 	((V_tcp_syncache.hash_secret ^					\
 	  (inc)->inc_faddr.s_addr ^					\
 	  ((inc)->inc_faddr.s_addr >> 16) ^				\
 	  (inc)->inc_fport ^ (inc)->inc_lport) & (mask))
 
 /*
  * Bucket index for an IPv6 connection: same scheme, using the first and
  * last 32-bit words of the foreign address.
  */
 #define SYNCACHE_HASH6(inc, mask)					\
 	((V_tcp_syncache.hash_secret ^					\
 	  (inc)->inc6_faddr.s6_addr32[0] ^				\
 	  (inc)->inc6_faddr.s6_addr32[3] ^				\
 	  (inc)->inc_fport ^ (inc)->inc_lport) & (mask))
 
 /* Field-by-field comparison of two IPv4 endpoint descriptions. */
 #define ENDPTS_EQ(a, b) (						\
 	(a)->ie_fport == (b)->ie_fport &&				\
 	(a)->ie_lport == (b)->ie_lport &&				\
 	(a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr &&			\
 	(a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr			\
 )
 
 /* Whole-object comparison of two endpoint descriptions (IPv6 path). */
 #define ENDPTS6_EQ(a, b) (memcmp((a), (b), sizeof(*(a))) == 0)
 
 /* Per-bucket mutex helpers. */
 #define	SCH_LOCK(sch)		mtx_lock(&(sch)->sch_mtx)
 #define	SCH_UNLOCK(sch)		mtx_unlock(&(sch)->sch_mtx)
 #define	SCH_LOCK_ASSERT(sch)	mtx_assert(&(sch)->sch_mtx, MA_OWNED)
 
 /*
  * Requires the syncache entry to be already removed from the bucket list.
  */
 static void
 syncache_free(struct syncache *sc)
 {
 	INIT_VNET_INET(curvnet);
 
 	if (sc->sc_ipopts)
 		(void) m_free(sc->sc_ipopts);
 	if (sc->sc_cred)
 		crfree(sc->sc_cred);
 #ifdef MAC
 	mac_syncache_destroy(&sc->sc_label);
 #endif
 
 	uma_zfree(V_tcp_syncache.zone, sc);
 }
 
 void
 syncache_init(void)
 {
 	INIT_VNET_INET(curvnet);
 	int i;
 
 	V_tcp_syncookies = 1;
 	V_tcp_syncookiesonly = 0;
 	V_tcp_sc_rst_sock_fail = 1;
 
 	V_tcp_syncache.cache_count = 0;
 	V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
 	V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
 	V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
 	V_tcp_syncache.hash_secret = arc4random();
 
 	TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
 	    &V_tcp_syncache.hashsize);
 	TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
 	    &V_tcp_syncache.bucket_limit);
 	if (!powerof2(V_tcp_syncache.hashsize) ||
 	    V_tcp_syncache.hashsize == 0) {
 		printf("WARNING: syncache hash size is not a power of 2.\n");
 		V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
 	}
 	V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1;
 
 	/* Set limits. */
 	V_tcp_syncache.cache_limit =
 	    V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit;
 	TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
 	    &V_tcp_syncache.cache_limit);
 
 	/* Allocate the hash table. */
 	V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize *
 	    sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO);
 
 	/* Initialize the hash buckets. */
 	for (i = 0; i < V_tcp_syncache.hashsize; i++) {
 		TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket);
 		mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head",
 			 NULL, MTX_DEF);
 		callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer,
 			 &V_tcp_syncache.hashbase[i].sch_mtx, 0);
 		V_tcp_syncache.hashbase[i].sch_length = 0;
 	}
 
 	/* Create the syncache entry zone. */
 	V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit);
 }
 
 /*
  * Add a syncache entry at the head of the given bucket row and start
  * its retransmit timeout.  Takes and releases the row lock itself.
  */
 static void
 syncache_insert(struct syncache *sc, struct syncache_head *sch)
 {
 	INIT_VNET_INET(sch->sch_vnet);
 	struct syncache *victim;
 
 	SCH_LOCK(sch);
 
 	/*
 	 * Enforce the per-bucket limit: when the row is full, the entry
 	 * at the tail (the oldest one, as inserts go to the head) is
 	 * dropped to make room.
 	 */
 	if (sch->sch_length >= V_tcp_syncache.bucket_limit) {
 		KASSERT(!TAILQ_EMPTY(&sch->sch_bucket),
 			("sch->sch_length incorrect"));
 		victim = TAILQ_LAST(&sch->sch_bucket, sch_head);
 		syncache_drop(victim, sch);
 		V_tcpstat.tcps_sc_bucketoverflow++;
 	}
 
 	/* Link the new entry in. */
 	TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
 
 	/* First entry in the row: reset the row's next-callout time. */
 	if (++sch->sch_length == 1)
 		sch->sch_nextc = ticks + INT_MAX;
 	syncache_timeout(sc, sch, 1);
 
 	SCH_UNLOCK(sch);
 
 	/* Global accounting is updated after the row lock is released. */
 	V_tcp_syncache.cache_count++;
 	V_tcpstat.tcps_sc_added++;
 }
 
 /*
  * Remove and free entry from syncache bucket row.
  * Expects locked syncache head.
  */
 static void
 syncache_drop(struct syncache *sc, struct syncache_head *sch)
 {
 	INIT_VNET_INET(sch->sch_vnet);
 
 	SCH_LOCK_ASSERT(sch);
 
 	TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 	sch->sch_length--;
 
 #ifndef TCP_OFFLOAD_DISABLE
 	if (sc->sc_tu)
 		sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb);
 #endif		    
 	syncache_free(sc);
 	V_tcp_syncache.cache_count--;
 }
 
 /*
  * Compute the next retransmit time of an entry from the backoff table,
  * count the (re)transmission, and pull the row's next-callout time
  * forward if this entry is due earlier.  The callout itself is only
  * re-armed when 'docallout' is non-zero.
  */
 static void
 syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout)
 {
 	sc->sc_rxttime = ticks + TCPTV_RTOBASE * tcp_backoff[sc->sc_rxmits];
 	sc->sc_rxmits += 1;
 
 	/* Nothing more to do unless this entry is the earliest in the row. */
 	if (!(TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)))
 		return;
 
 	sch->sch_nextc = sc->sc_rxttime;
 	if (docallout != 0)
 		callout_reset(&sch->sch_timer, sch->sch_nextc - ticks,
 		    syncache_timer, (void *)sch);
 }
 
 /*
  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
  * If we have retransmitted an entry the maximum number of times, expire it.
  * One separate timer for each bucket row.
  */
 static void
 syncache_timer(void *xsch)
 {
 	struct syncache_head *sch = (struct syncache_head *)xsch;
 	struct syncache *sc, *nsc;
 	int tick = ticks;
 	char *s;
 
 	CURVNET_SET(sch->sch_vnet);
 	INIT_VNET_INET(sch->sch_vnet);
 
 	/* NB: syncache_head has already been locked by the callout. */
 	SCH_LOCK_ASSERT(sch);
 
 	/*
 	 * In the following cycle we may remove some entries and/or
 	 * advance some timeouts, so re-initialize the bucket timer.
 	 */
 	sch->sch_nextc = tick + INT_MAX;
 
 	TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) {
 		/*
 		 * We do not check if the listen socket still exists
 		 * and accept the case where the listen socket may be
 		 * gone by the time we resend the SYN/ACK.  We do
 		 * not expect this to happens often. If it does,
 		 * then the RST will be sent by the time the remote
 		 * host does the SYN/ACK->ACK.
 		 */
 		if (TSTMP_GT(sc->sc_rxttime, tick)) {
 			if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc))
 				sch->sch_nextc = sc->sc_rxttime;
 			continue;
 		}
 		if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) {
 			if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 				log(LOG_DEBUG, "%s; %s: Retransmits exhausted, "
 				    "giving up and removing syncache entry\n",
 				    s, __func__);
 				free(s, M_TCPLOG);
 			}
 			syncache_drop(sc, sch);
 			V_tcpstat.tcps_sc_stale++;
 			continue;
 		}
 		if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Response timeout, "
 			    "retransmitting (%u) SYN|ACK\n",
 			    s, __func__, sc->sc_rxmits);
 			free(s, M_TCPLOG);
 		}
 
 		(void) syncache_respond(sc);
 		V_tcpstat.tcps_sc_retransmitted++;
 		syncache_timeout(sc, sch, 0);
 	}
 	if (!TAILQ_EMPTY(&(sch)->sch_bucket))
 		callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick,
 			syncache_timer, (void *)(sch));
 	CURVNET_RESTORE();
 }
 
 /*
  * Find an entry in the syncache.
  *
  * The bucket row is selected by hashing 'inc' (SYNCACHE_HASH6 for IPv6
  * connection info on INET6 kernels, SYNCACHE_HASH otherwise), stored in
  * '*schp' and locked; the row is then searched for an entry with equal
  * endpoints.
  *
  * Returns always with locked syncache_head plus a matching entry or NULL.
  * The caller is responsible for unlocking '*schp'.
  */
 struct syncache *
 syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp)
 {
 	INIT_VNET_INET(curvnet);
 	struct syncache *sc;
 	struct syncache_head *sch;
 
 #ifdef INET6
-	if (inc->inc_isipv6) {
+	if (inc->inc_flags & INC_ISIPV6) {
 		sch = &V_tcp_syncache.hashbase[
 		    SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)];
 		*schp = sch;
 
 		SCH_LOCK(sch);
 
 		/* Circle through bucket row to find matching entry. */
 		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
 			if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
 				return (sc);
 		}
 	} else
 #endif
 	{
 		sch = &V_tcp_syncache.hashbase[
 		    SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)];
 		*schp = sch;
 
 		SCH_LOCK(sch);
 
 		/* Circle through bucket row to find matching entry. */
 		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
 #ifdef INET6
 			/* IPv6 entries in this row never match an IPv4 lookup. */
-			if (sc->sc_inc.inc_isipv6)
+			if (sc->sc_inc.inc_flags & INC_ISIPV6)
 				continue;
 #endif
 			if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
 				return (sc);
 		}
 	}
 	/* No match: the row nevertheless stays locked for the caller. */
 	SCH_LOCK_ASSERT(*schp);
 	return (NULL);			/* always returns with locked sch */
 }
 
 /*
  * Called when a RST arrives for a connection that has no PCB, to see
  * whether the connection is still in the syn cache.  A matching entry
  * is removed if the RST passes validation; otherwise the segment is
  * counted as a bad RST and ignored.
  */
 void
 syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th)
 {
 	INIT_VNET_INET(curvnet);
 	struct syncache *sc;
 	struct syncache_head *sch;
 	char *logstr = NULL;
 
 	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 
 	if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) {
 		/*
 		 * Any RST to our SYN|ACK must not carry ACK, SYN or FIN
 		 * flags.  See RFC 793 page 65, section SEGMENT ARRIVES.
 		 */
 		logstr = tcp_log_addrs(inc, th, NULL, NULL);
 		if (logstr != NULL)
 			log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or "
 			    "FIN flag set, segment ignored\n", logstr, __func__);
 		V_tcpstat.tcps_badrst++;
 	} else if (sc == NULL) {
 		/*
 		 * No corresponding connection was found in syncache.
 		 * If syncookies are enabled and possibly exclusively
 		 * used, or we are under memory pressure, a valid RST
 		 * may not find a syncache entry.  In that case we're
 		 * done and no SYN|ACK retransmissions will happen.
 		 * Otherwise the RST was misdirected or spoofed.
 		 */
 		logstr = tcp_log_addrs(inc, th, NULL, NULL);
 		if (logstr != NULL)
 			log(LOG_DEBUG, "%s; %s: Spurious RST without matching "
 			    "syncache entry (possibly syncookie only), "
 			    "segment ignored\n", logstr, __func__);
 		V_tcpstat.tcps_badrst++;
 	} else if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
 	    SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
 		/*
 		 * The RST bit is set; its sequence number decides whether
 		 * this is a valid reset segment.
 		 * RFC 793 page 37:
 		 *   In all states except SYN-SENT, all reset (RST) segments
 		 *   are validated by checking their SEQ-fields.  A reset is
 		 *   valid if its sequence number is in the window.
 		 *
 		 *   The sequence number in the reset segment is normally an
 		 *   echo of our outgoing acknowledgement numbers, but some
 		 *   hosts send a reset with the sequence number at the
 		 *   rightmost edge of our receive window, and we have to
 		 *   handle this case.
 		 */
 		syncache_drop(sc, sch);
 		logstr = tcp_log_addrs(inc, th, NULL, NULL);
 		if (logstr != NULL)
 			log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, "
 			    "connection attempt aborted by remote endpoint\n",
 			    logstr, __func__);
 		V_tcpstat.tcps_sc_reset++;
 	} else {
 		/* Sequence number outside the window: ignore the RST. */
 		logstr = tcp_log_addrs(inc, th, NULL, NULL);
 		if (logstr != NULL)
 			log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != "
 			    "IRS %u (+WND %u), segment ignored\n",
 			    logstr, __func__, th->th_seq, sc->sc_irs,
 			    sc->sc_wnd);
 		V_tcpstat.tcps_badrst++;
 	}
 
 	if (logstr != NULL)
 		free(logstr, M_TCPLOG);
 	SCH_UNLOCK(sch);
 }
 
 /*
  * Remove the syncache entry matching 'inc', if there is one, and count
  * it as dropped because of a bad ACK.
  */
 void
 syncache_badack(struct in_conninfo *inc)
 {
 	INIT_VNET_INET(curvnet);
 	struct syncache_head *sch;
 	struct syncache *match;
 
 	match = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 	if (match == NULL) {
 		SCH_UNLOCK(sch);
 		return;
 	}
 	syncache_drop(match, sch);
 	V_tcpstat.tcps_sc_badack++;
 	SCH_UNLOCK(sch);
 }
 
 /*
  * Handle an unreachable indication for a connection that may still be
  * in the syncache.  The first indication, or any indication before the
  * entry has been retransmitted 3 times, only marks the entry with
  * SCF_UNREACH; a later one removes it.
  */
 void
 syncache_unreach(struct in_conninfo *inc, struct tcphdr *th)
 {
 	INIT_VNET_INET(curvnet);
 	struct syncache *sc;
 	struct syncache_head *sch;
 
 	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 
 	/*
 	 * Ignore the message when there is no entry, or when the sequence
 	 * number != sc_iss, in which case it's a bogus ICMP msg.
 	 */
 	if (sc != NULL && ntohl(th->th_seq) == sc->sc_iss) {
 		/*
 		 * If we've retransmitted 3 times and this is our second
 		 * error, we remove the entry.  Otherwise, we allow it to
 		 * continue on.  This prevents us from incorrectly nuking
 		 * an entry during a spurious network outage.
 		 *
 		 * See tcp_notify().
 		 */
 		if ((sc->sc_flags & SCF_UNREACH) == 0 ||
 		    sc->sc_rxmits < 3 + 1) {
 			sc->sc_flags |= SCF_UNREACH;
 		} else {
 			syncache_drop(sc, sch);
 			V_tcpstat.tcps_sc_unreach++;
 		}
 	}
 	SCH_UNLOCK(sch);
 }
 
 /*
  * Build a new TCP socket structure from a syncache entry.
  */
 static struct socket *
 syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
 {
 	INIT_VNET_INET(lso->so_vnet);
 	struct inpcb *inp = NULL;
 	struct socket *so;
 	struct tcpcb *tp;
 	char *s;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 	/*
 	 * Ok, create the full blown connection, and set things up
 	 * as they would have been set up if we had created the
 	 * connection when the SYN arrived.  If we can't create
 	 * the connection, abort it.
 	 */
 	so = sonewconn(lso, SS_ISCONNECTED);
 	if (so == NULL) {
 		/*
 		 * Drop the connection; we will either send a RST or
 		 * have the peer retransmit its SYN again after its
 		 * RTO and try again.
 		 */
 		V_tcpstat.tcps_listendrop++;
 		if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Socket create failed "
 			    "due to limits or memory shortage\n",
 			    s, __func__);
 			free(s, M_TCPLOG);
 		}
 		goto abort2;
 	}
 #ifdef MAC
 	SOCK_LOCK(so);
 	mac_socketpeer_set_from_mbuf(m, so);
 	SOCK_UNLOCK(so);
 #endif
 
 	inp = sotoinpcb(so);
 	inp->inp_inc.inc_fibnum = sc->sc_inc.inc_fibnum;
 	so->so_fibnum = sc->sc_inc.inc_fibnum;
 	INP_WLOCK(inp);
 
 	/* Insert new socket into PCB hash list. */
-	inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6;
+	inp->inp_inc.inc_flags = sc->sc_inc.inc_flags;
 #ifdef INET6
-	if (sc->sc_inc.inc_isipv6) {
+	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		inp->in6p_laddr = sc->sc_inc.inc6_laddr;
 	} else {
 		inp->inp_vflag &= ~INP_IPV6;
 		inp->inp_vflag |= INP_IPV4;
 #endif
 		inp->inp_laddr = sc->sc_inc.inc_laddr;
 #ifdef INET6
 	}
 #endif
 	inp->inp_lport = sc->sc_inc.inc_lport;
 	if (in_pcbinshash(inp) != 0) {
 		/*
 		 * Undo the assignments above if we failed to
 		 * put the PCB on the hash lists.
 		 */
 #ifdef INET6
-		if (sc->sc_inc.inc_isipv6)
+		if (sc->sc_inc.inc_flags & INC_ISIPV6)
 			inp->in6p_laddr = in6addr_any;
 		else
 #endif
 			inp->inp_laddr.s_addr = INADDR_ANY;
 		inp->inp_lport = 0;
 		goto abort;
 	}
 #ifdef IPSEC
 	/* Copy old policy into new socket's. */
 	if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
 		printf("syncache_socket: could not copy policy\n");
 #endif
 #ifdef INET6
-	if (sc->sc_inc.inc_isipv6) {
+	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		struct inpcb *oinp = sotoinpcb(lso);
 		struct in6_addr laddr6;
 		struct sockaddr_in6 sin6;
 		/*
 		 * Inherit socket options from the listening socket.
 		 * Note that in6p_inputopts are not (and should not be)
 		 * copied, since it stores previously received options and is
 		 * used to detect if each new option is different than the
 		 * previous one and hence should be passed to a user.
 		 * If we copied in6p_inputopts, a user would not be able to
 		 * receive options just after calling the accept system call.
 		 */
 		inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
 		if (oinp->in6p_outputopts)
 			inp->in6p_outputopts =
 			    ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
 
 		sin6.sin6_family = AF_INET6;
 		sin6.sin6_len = sizeof(sin6);
 		sin6.sin6_addr = sc->sc_inc.inc6_faddr;
 		sin6.sin6_port = sc->sc_inc.inc_fport;
 		sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
 		laddr6 = inp->in6p_laddr;
 		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 			inp->in6p_laddr = sc->sc_inc.inc6_laddr;
 		if (in6_pcbconnect(inp, (struct sockaddr *)&sin6,
 		    thread0.td_ucred)) {
 			inp->in6p_laddr = laddr6;
 			goto abort;
 		}
 		/* Override flowlabel from in6_pcbconnect. */
 		inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
 		inp->inp_flow |= sc->sc_flowlabel;
 	} else
 #endif
 	{
 		struct in_addr laddr;
 		struct sockaddr_in sin;
 
 		inp->inp_options = (m) ? ip_srcroute(m) : NULL;
 		
 		if (inp->inp_options == NULL) {
 			inp->inp_options = sc->sc_ipopts;
 			sc->sc_ipopts = NULL;
 		}
 
 		sin.sin_family = AF_INET;
 		sin.sin_len = sizeof(sin);
 		sin.sin_addr = sc->sc_inc.inc_faddr;
 		sin.sin_port = sc->sc_inc.inc_fport;
 		bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
 		laddr = inp->inp_laddr;
 		if (inp->inp_laddr.s_addr == INADDR_ANY)
 			inp->inp_laddr = sc->sc_inc.inc_laddr;
 		if (in_pcbconnect(inp, (struct sockaddr *)&sin,
 		    thread0.td_ucred)) {
 			inp->inp_laddr = laddr;
 			goto abort;
 		}
 	}
 	tp = intotcpcb(inp);
 	tp->t_state = TCPS_SYN_RECEIVED;
 	tp->iss = sc->sc_iss;
 	tp->irs = sc->sc_irs;
 	tcp_rcvseqinit(tp);
 	tcp_sendseqinit(tp);
 	tp->snd_wl1 = sc->sc_irs;
 	tp->snd_max = tp->iss + 1;
 	tp->snd_nxt = tp->iss + 1;
 	tp->rcv_up = sc->sc_irs + 1;
 	tp->rcv_wnd = sc->sc_wnd;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
 	tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
 	if (sc->sc_flags & SCF_NOOPT)
 		tp->t_flags |= TF_NOOPT;
 	else {
 		if (sc->sc_flags & SCF_WINSCALE) {
 			tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
 			tp->snd_scale = sc->sc_requested_s_scale;
 			tp->request_r_scale = sc->sc_requested_r_scale;
 		}
 		if (sc->sc_flags & SCF_TIMESTAMP) {
 			tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
 			tp->ts_recent = sc->sc_tsreflect;
 			tp->ts_recent_age = ticks;
 			tp->ts_offset = sc->sc_tsoff;
 		}
 #ifdef TCP_SIGNATURE
 		if (sc->sc_flags & SCF_SIGNATURE)
 			tp->t_flags |= TF_SIGNATURE;
 #endif
 		if (sc->sc_flags & SCF_SACK)
 			tp->t_flags |= TF_SACK_PERMIT;
 	}
 
 	if (sc->sc_flags & SCF_ECN)
 		tp->t_flags |= TF_ECN_PERMIT;
 
 	/*
 	 * Set up MSS and get cached values from tcp_hostcache.
 	 * This might overwrite some of the defaults we just set.
 	 */
 	tcp_mss(tp, sc->sc_peer_mss);
 
 	/*
 	 * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
 	 */
 	if (sc->sc_rxmits)
 		tp->snd_cwnd = tp->t_maxseg;
 	tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
 
 	INP_WUNLOCK(inp);
 
 	V_tcpstat.tcps_accepts++;
 	return (so);
 
 abort:
 	INP_WUNLOCK(inp);
 abort2:
 	if (so != NULL)
 		soabort(so);
 	return (NULL);
 }
 
 /*
  * This function gets called when we receive an ACK for a
  * socket in the LISTEN state.  We look up the connection
  * in the syncache, and if its there, we pull it out of
  * the cache and turn it into a full-blown connection in
  * the SYN-RECEIVED state.
  */
 int
 syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
     struct socket **lsop, struct mbuf *m)
 {
 	INIT_VNET_INET(curvnet);
 	struct syncache *sc;
 	struct syncache_head *sch;
 	struct syncache scs;
 	char *s;
 
 	/*
 	 * Global TCP locks are held because we manipulate the PCB lists
 	 * and create a new socket.
 	 */
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK,
 	    ("%s: can handle only ACK", __func__));
 
 	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 	if (sc == NULL) {
 		/*
 		 * There is no syncache entry, so see if this ACK is
 		 * a returning syncookie.  To do this, first:
 		 *  A. See if this socket has had a syncache entry dropped in
 		 *     the past.  We don't want to accept a bogus syncookie
 		 *     if we've never received a SYN.
 		 *  B. check that the syncookie is valid.  If it is, then
 		 *     cobble up a fake syncache entry, and return.
 		 */
 		if (!V_tcp_syncookies) {
 			SCH_UNLOCK(sch);
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Spurious ACK, "
 				    "segment rejected (syncookies disabled)\n",
 				    s, __func__);
 			goto failed;
 		}
 		bzero(&scs, sizeof(scs));
 		sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop);
 		SCH_UNLOCK(sch);
 		if (sc == NULL) {
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Segment failed "
 				    "SYNCOOKIE authentication, segment rejected "
 				    "(probably spoofed)\n", s, __func__);
 			goto failed;
 		}
 	} else {
 		/* Pull out the entry to unlock the bucket row. */
 		TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 		sch->sch_length--;
 		V_tcp_syncache.cache_count--;
 		SCH_UNLOCK(sch);
 	}
 
 	/*
 	 * Segment validation:
 	 * ACK must match our initial sequence number + 1 (the SYN|ACK).
 	 */
 	if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment "
 			    "rejected\n", s, __func__, th->th_ack, sc->sc_iss);
 		goto failed;
 	}
 
 	/*
 	 * The SEQ must fall in the window starting at the received
 	 * initial receive sequence number + 1 (the SYN).
 	 */
 	if ((SEQ_LEQ(th->th_seq, sc->sc_irs) ||
 	    SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) &&
 	    !TOEPCB_ISSET(sc)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment "
 			    "rejected\n", s, __func__, th->th_seq, sc->sc_irs);
 		goto failed;
 	}
 
 	if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
 			    "segment rejected\n", s, __func__);
 		goto failed;
 	}
 	/*
 	 * If timestamps were negotiated the reflected timestamp
 	 * must be equal to what we actually sent in the SYN|ACK.
 	 */
 	if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts &&
 	    !TOEPCB_ISSET(sc)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, "
 			    "segment rejected\n",
 			    s, __func__, to->to_tsecr, sc->sc_ts);
 		goto failed;
 	}
 
 	*lsop = syncache_socket(sc, *lsop, m);
 
 	if (*lsop == NULL)
 		V_tcpstat.tcps_sc_aborted++;
 	else
 		V_tcpstat.tcps_sc_completed++;
 
 /* how do we find the inp for the new socket? */
 	if (sc != &scs)
 		syncache_free(sc);
 	return (1);
 failed:
 	if (sc != NULL && sc != &scs)
 		syncache_free(sc);
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	*lsop = NULL;
 	return (0);
 }
 
 int
 tcp_offload_syncache_expand(struct in_conninfo *inc, struct tcpopt *to,
     struct tcphdr *th, struct socket **lsop, struct mbuf *m)
 {
 	INIT_VNET_INET(curvnet);
 	int rc;
 	
 	INP_INFO_WLOCK(&V_tcbinfo);
 	rc = syncache_expand(inc, to, th, lsop, m);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 
 	return (rc);
 }
 
 /*
  * Given a LISTEN socket and an inbound SYN request, add
  * this to the syn cache, and send back a segment:
  *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
  * to the source.
  *
  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
  * Doing so would require that we hold onto the data and deliver it
  * to the application.  However, if we are the target of a SYN-flood
  * DoS attack, an attacker could send data which would eventually
  * consume all available buffer space if it were ACKed.  By not ACKing
  * the data, we avoid this DoS scenario.
  */
 static void
 _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
     struct inpcb *inp, struct socket **lsop, struct mbuf *m,
     struct toe_usrreqs *tu, void *toepcb)
 {
 	INIT_VNET_INET(inp->inp_vnet);
 	struct tcpcb *tp;
 	struct socket *so;
 	struct syncache *sc = NULL;
 	struct syncache_head *sch;
 	struct mbuf *ipopts = NULL;
 	u_int32_t flowtmp;
 	int win, sb_hiwat, ip_ttl, ip_tos, noopt;
 	char *s;
 #ifdef INET6
 	int autoflowlabel = 0;
 #endif
 #ifdef MAC
 	struct label *maclabel;
 #endif
 	struct syncache scs;
 	struct ucred *cred;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);			/* listen socket */
 	KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN,
 	    ("%s: unexpected tcp flags", __func__));
 
 	/*
 	 * Combine all so/tp operations very early to drop the INP lock as
 	 * soon as possible.
 	 */
 	so = *lsop;
 	tp = sototcpcb(so);
 	cred = crhold(so->so_cred);
 
 #ifdef INET6
-	if (inc->inc_isipv6 &&
+	if ((inc->inc_flags & INC_ISIPV6) &&
 	    (inp->inp_flags & IN6P_AUTOFLOWLABEL))
 		autoflowlabel = 1;
 #endif
 	ip_ttl = inp->inp_ip_ttl;
 	ip_tos = inp->inp_ip_tos;
 	win = sbspace(&so->so_rcv);
 	sb_hiwat = so->so_rcv.sb_hiwat;
 	noopt = (tp->t_flags & TF_NOOPT);
 
 	/* By the time we drop the lock these should no longer be used. */
 	so = NULL;
 	tp = NULL;
 
 #ifdef MAC
 	if (mac_syncache_init(&maclabel) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		goto done;
 	} else
 		mac_syncache_create(maclabel, inp);
 #endif
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 
 	/*
 	 * Remember the IP options, if any.
 	 */
 #ifdef INET6
-	if (!inc->inc_isipv6)
+	if (!(inc->inc_flags & INC_ISIPV6))
 #endif
 		ipopts = (m) ? ip_srcroute(m) : NULL;
 
 	/*
 	 * See if we already have an entry for this connection.
 	 * If we do, resend the SYN,ACK, and reset the retransmit timer.
 	 *
 	 * XXX: should the syncache be re-initialized with the contents
 	 * of the new SYN here (which may have different options?)
 	 *
 	 * XXX: We do not check the sequence number to see if this is a
 	 * real retransmit or a new connection attempt.  The question is
 	 * how to handle such a case; either ignore it as spoofed, or
 	 * drop the current entry and create a new one?
 	 */
 	sc = syncache_lookup(inc, &sch);	/* returns locked entry */
 	SCH_LOCK_ASSERT(sch);
 	if (sc != NULL) {
 #ifndef TCP_OFFLOAD_DISABLE
 		if (sc->sc_tu)
 			sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT,
 			    sc->sc_toepcb);
 #endif		    
 		V_tcpstat.tcps_sc_dupsyn++;
 		if (ipopts) {
 			/*
 			 * If we were remembering a previous source route,
 			 * forget it and use the new one we've been given.
 			 */
 			if (sc->sc_ipopts)
 				(void) m_free(sc->sc_ipopts);
 			sc->sc_ipopts = ipopts;
 		}
 		/*
 		 * Update timestamp if present.
 		 */
 		if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS))
 			sc->sc_tsreflect = to->to_tsval;
 		else
 			sc->sc_flags &= ~SCF_TIMESTAMP;
 #ifdef MAC
 		/*
 		 * Since we have already unconditionally allocated label
 		 * storage, free it up.  The syncache entry will already
 		 * have an initialized label we can use.
 		 */
 		mac_syncache_destroy(&maclabel);
 		KASSERT(sc->sc_label != NULL,
 		    ("%s: label not initialized", __func__));
 #endif
 		/* Retransmit SYN|ACK and reset retransmit count. */
 		if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Received duplicate SYN, "
 			    "resetting timer and retransmitting SYN|ACK\n",
 			    s, __func__);
 			free(s, M_TCPLOG);
 		}
 		if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) {
 			sc->sc_rxmits = 0;
 			syncache_timeout(sc, sch, 1);
 			V_tcpstat.tcps_sndacks++;
 			V_tcpstat.tcps_sndtotal++;
 		}
 		SCH_UNLOCK(sch);
 		goto done;
 	}
 
 	sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
 	if (sc == NULL) {
 		/*
 		 * The zone allocator couldn't provide more entries.
 		 * Treat this as if the cache was full; drop the oldest
 		 * entry and insert the new one.
 		 */
 		V_tcpstat.tcps_sc_zonefail++;
 		if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL)
 			syncache_drop(sc, sch);
 		sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
 		if (sc == NULL) {
 			if (V_tcp_syncookies) {
 				bzero(&scs, sizeof(scs));
 				sc = &scs;
 			} else {
 				SCH_UNLOCK(sch);
 				if (ipopts)
 					(void) m_free(ipopts);
 				goto done;
 			}
 		}
 	}
 	
 	/*
 	 * Fill in the syncache values.
 	 */
 #ifdef MAC
 	sc->sc_label = maclabel;
 #endif
 	sc->sc_cred = cred;
 	cred = NULL;
 	sc->sc_ipopts = ipopts;
+	/* XXX-BZ this fib assignment is just useless. */
 	sc->sc_inc.inc_fibnum = inp->inp_inc.inc_fibnum;
 	bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
 #ifdef INET6
-	if (!inc->inc_isipv6)
+	if (!(inc->inc_flags & INC_ISIPV6))
 #endif
 	{
 		sc->sc_ip_tos = ip_tos;
 		sc->sc_ip_ttl = ip_ttl;
 	}
 #ifndef TCP_OFFLOAD_DISABLE	
 	sc->sc_tu = tu;
 	sc->sc_toepcb = toepcb;
 #endif
 	sc->sc_irs = th->th_seq;
 	sc->sc_iss = arc4random();
 	sc->sc_flags = 0;
 	sc->sc_flowlabel = 0;
 
 	/*
 	 * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN].
 	 * win was derived from socket earlier in the function.
 	 */
 	win = imax(win, 0);
 	win = imin(win, TCP_MAXWIN);
 	sc->sc_wnd = win;
 
 	if (V_tcp_do_rfc1323) {
 		/*
 		 * A timestamp received in a SYN makes
 		 * it ok to send timestamp requests and replies.
 		 */
 		if (to->to_flags & TOF_TS) {
 			sc->sc_tsreflect = to->to_tsval;
 			sc->sc_ts = ticks;
 			sc->sc_flags |= SCF_TIMESTAMP;
 		}
 		if (to->to_flags & TOF_SCALE) {
 			int wscale = 0;
 
 			/*
 			 * Pick the smallest possible scaling factor that
 			 * will still allow us to scale up to sb_max, aka
 			 * kern.ipc.maxsockbuf.
 			 *
 			 * We do this because there are broken firewalls that
 			 * will corrupt the window scale option, leading to
 			 * the other endpoint believing that our advertised
 			 * window is unscaled.  At scale factors larger than
 			 * 5 the unscaled window will drop below 1500 bytes,
 			 * leading to serious problems when traversing these
 			 * broken firewalls.
 			 *
 			 * With the default maxsockbuf of 256K, a scale factor
 			 * of 3 will be chosen by this algorithm.  Those who
 			 * choose a larger maxsockbuf should watch out
 			 * for the compatiblity problems mentioned above.
 			 *
 			 * RFC1323: The Window field in a SYN (i.e., a <SYN>
 			 * or <SYN,ACK>) segment itself is never scaled.
 			 */
 			while (wscale < TCP_MAX_WINSHIFT &&
 			    (TCP_MAXWIN << wscale) < sb_max)
 				wscale++;
 			sc->sc_requested_r_scale = wscale;
 			sc->sc_requested_s_scale = to->to_wscale;
 			sc->sc_flags |= SCF_WINSCALE;
 		}
 	}
 #ifdef TCP_SIGNATURE
 	/*
 	 * If listening socket requested TCP digests, and received SYN
 	 * contains the option, flag this in the syncache so that
 	 * syncache_respond() will do the right thing with the SYN+ACK.
 	 * XXX: Currently we always record the option by default and will
 	 * attempt to use it in syncache_respond().
 	 */
 	if (to->to_flags & TOF_SIGNATURE)
 		sc->sc_flags |= SCF_SIGNATURE;
 #endif
 	if (to->to_flags & TOF_SACKPERM)
 		sc->sc_flags |= SCF_SACK;
 	if (to->to_flags & TOF_MSS)
 		sc->sc_peer_mss = to->to_mss;	/* peer mss may be zero */
 	if (noopt)
 		sc->sc_flags |= SCF_NOOPT;
 	if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn)
 		sc->sc_flags |= SCF_ECN;
 
 	if (V_tcp_syncookies) {
 		syncookie_generate(sch, sc, &flowtmp);
 #ifdef INET6
 		if (autoflowlabel)
 			sc->sc_flowlabel = flowtmp;
 #endif
 	} else {
 #ifdef INET6
 		if (autoflowlabel)
 			sc->sc_flowlabel =
 			    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
 #endif
 	}
 	SCH_UNLOCK(sch);
 
 	/*
 	 * Do a standard 3-way handshake.
 	 */
 	if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) {
 		if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs)
 			syncache_free(sc);
 		else if (sc != &scs)
 			syncache_insert(sc, sch);   /* locks and unlocks sch */
 		V_tcpstat.tcps_sndacks++;
 		V_tcpstat.tcps_sndtotal++;
 	} else {
 		if (sc != &scs)
 			syncache_free(sc);
 		V_tcpstat.tcps_sc_dropped++;
 	}
 
 done:
 	if (cred != NULL)
 		crfree(cred);
 #ifdef MAC
 	if (sc == &scs)
 		mac_syncache_destroy(&maclabel);
 #endif
 	if (m) {
 		
 		*lsop = NULL;
 		m_freem(m);
 	}
 }
 
 static int
 syncache_respond(struct syncache *sc)
 {
 	INIT_VNET_INET(curvnet);
 	struct ip *ip = NULL;
 	struct mbuf *m;
 	struct tcphdr *th;
 	int optlen, error;
 	u_int16_t hlen, tlen, mssopt;
 	struct tcpopt to;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 #endif
 
 	hlen =
 #ifdef INET6
-	       (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) :
+	       (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) :
 #endif
 		sizeof(struct ip);
 	tlen = hlen + sizeof(struct tcphdr);
 
 	/* Determine MSS we advertize to other end of connection. */
 	mssopt = tcp_mssopt(&sc->sc_inc);
 	if (sc->sc_peer_mss)
 		mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss);
 
 	/* XXX: Assume that the entire packet will fit in a header mbuf. */
 	KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN,
 	    ("syncache: mbuf too small"));
 
 	/* Create the IP+TCP header from scratch. */
 	m = m_gethdr(M_DONTWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 #ifdef MAC
 	mac_syncache_create_mbuf(sc->sc_label, m);
 #endif
 	m->m_data += max_linkhdr;
 	m->m_len = tlen;
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 
 #ifdef INET6
-	if (sc->sc_inc.inc_isipv6) {
+	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_src = sc->sc_inc.inc6_laddr;
 		ip6->ip6_dst = sc->sc_inc.inc6_faddr;
 		ip6->ip6_plen = htons(tlen - hlen);
 		/* ip6_hlim is set after checksum */
 		ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
 		ip6->ip6_flow |= sc->sc_flowlabel;
 
 		th = (struct tcphdr *)(ip6 + 1);
 	} else
 #endif
 	{
 		ip = mtod(m, struct ip *);
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(struct ip) >> 2;
 		ip->ip_len = tlen;
 		ip->ip_id = 0;
 		ip->ip_off = 0;
 		ip->ip_sum = 0;
 		ip->ip_p = IPPROTO_TCP;
 		ip->ip_src = sc->sc_inc.inc_laddr;
 		ip->ip_dst = sc->sc_inc.inc_faddr;
 		ip->ip_ttl = sc->sc_ip_ttl;
 		ip->ip_tos = sc->sc_ip_tos;
 
 		/*
 		 * See if we should do MTU discovery.  Route lookups are
 		 * expensive, so we will only unset the DF bit if:
 		 *
 		 *	1) path_mtu_discovery is disabled
 		 *	2) the SCF_UNREACH flag has been set
 		 */
 		if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
 		       ip->ip_off |= IP_DF;
 
 		th = (struct tcphdr *)(ip + 1);
 	}
 	th->th_sport = sc->sc_inc.inc_lport;
 	th->th_dport = sc->sc_inc.inc_fport;
 
 	th->th_seq = htonl(sc->sc_iss);
 	th->th_ack = htonl(sc->sc_irs + 1);
 	th->th_off = sizeof(struct tcphdr) >> 2;
 	th->th_x2 = 0;
 	th->th_flags = TH_SYN|TH_ACK;
 	th->th_win = htons(sc->sc_wnd);
 	th->th_urp = 0;
 
 	if (sc->sc_flags & SCF_ECN) {
 		th->th_flags |= TH_ECE;
 		V_tcpstat.tcps_ecn_shs++;
 	}
 
 	/* Tack on the TCP options. */
 	if ((sc->sc_flags & SCF_NOOPT) == 0) {
 		to.to_flags = 0;
 
 		to.to_mss = mssopt;
 		to.to_flags = TOF_MSS;
 		if (sc->sc_flags & SCF_WINSCALE) {
 			to.to_wscale = sc->sc_requested_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		if (sc->sc_flags & SCF_TIMESTAMP) {
 			/* Virgin timestamp or TCP cookie enhanced one. */
 			to.to_tsval = sc->sc_ts;
 			to.to_tsecr = sc->sc_tsreflect;
 			to.to_flags |= TOF_TS;
 		}
 		if (sc->sc_flags & SCF_SACK)
 			to.to_flags |= TOF_SACKPERM;
 #ifdef TCP_SIGNATURE
 		if (sc->sc_flags & SCF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif
 		optlen = tcp_addoptions(&to, (u_char *)(th + 1));
 
 		/* Adjust headers by option size. */
 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 		m->m_len += optlen;
 		m->m_pkthdr.len += optlen;
 
 #ifdef TCP_SIGNATURE
 		if (sc->sc_flags & SCF_SIGNATURE)
 			tcp_signature_compute(m, 0, 0, optlen,
 			    to.to_signature, IPSEC_DIR_OUTBOUND);
 #endif
 #ifdef INET6
-		if (sc->sc_inc.inc_isipv6)
+		if (sc->sc_inc.inc_flags & INC_ISIPV6)
 			ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen);
 		else
 #endif
 			ip->ip_len += optlen;
 	} else
 		optlen = 0;
 
 #ifdef INET6
-	if (sc->sc_inc.inc_isipv6) {
+	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		th->th_sum = 0;
 		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen,
 				       tlen + optlen - hlen);
 		ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
 		error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
 	} else
 #endif
 	{
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(tlen + optlen - hlen + IPPROTO_TCP));
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 		error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
 	}
 	return (error);
 }
 
 void
 syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
     struct inpcb *inp, struct socket **lsop, struct mbuf *m)
 {
 	_syncache_add(inc, to, th, inp, lsop, m, NULL, NULL);
 }
 
 void
 tcp_offload_syncache_add(struct in_conninfo *inc, struct tcpopt *to,
     struct tcphdr *th, struct inpcb *inp, struct socket **lsop,
     struct toe_usrreqs *tu, void *toepcb)
 {
 	INIT_VNET_INET(curvnet);
 
 	INP_INFO_WLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	_syncache_add(inc, to, th, inp, lsop, NULL, tu, toepcb);
 }
 
 /*
  * The purpose of SYN cookies is to avoid keeping track of all SYN's we
  * receive and to be able to handle SYN floods from bogus source addresses
  * (where we will never receive any reply).  SYN floods try to exhaust all
  * our memory and available slots in the SYN cache table to cause a denial
  * of service to legitimate users of the local host.
  *
  * The idea of SYN cookies is to encode and include all necessary information
  * about the connection setup state within the SYN-ACK we send back and thus
  * to get along without keeping any local state until the ACK to the SYN-ACK
  * arrives (if ever).  Everything we need to know should be available from
  * the information we encoded in the SYN-ACK.
  *
  * More information about the theory behind SYN cookies and its first
  * discussion and specification can be found at:
  *  http://cr.yp.to/syncookies.html    (overview)
  *  http://cr.yp.to/syncookies/archive (gory details)
  *
  * This implementation extends the orginal idea and first implementation
  * of FreeBSD by using not only the initial sequence number field to store
  * information but also the timestamp field if present.  This way we can
  * keep track of the entire state we need to know to recreate the session in
  * its original form.  Almost all TCP speakers implement RFC1323 timestamps
  * these days.  For those that do not we still have to live with the known
  * shortcomings of the ISN only SYN cookies.
  *
  * Cookie layers:
  *
  * Initial sequence number we send:
  * 31|................................|0
  *    DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP
  *    D = MD5 Digest (first dword)
  *    M = MSS index
  *    R = Rotation of secret
  *    P = Odd or Even secret
  *
  * The MD5 Digest is computed with over following parameters:
  *  a) randomly rotated secret
  *  b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6)
  *  c) the received initial sequence number from remote host
  *  d) the rotation offset and odd/even bit
  *
  * Timestamp we send:
  * 31|................................|0
  *    DDDDDDDDDDDDDDDDDDDDDDSSSSRRRRA5
  *    D = MD5 Digest (third dword) (only as filler)
  *    S = Requested send window scale
  *    R = Requested receive window scale
  *    A = SACK allowed
  *    5 = TCP-MD5 enabled (not implemented yet)
  *    XORed with MD5 Digest (forth dword)
  *
  * The timestamp isn't cryptographically secure and doesn't need to be.
  * The double use of the MD5 digest dwords ties it to a specific remote/
  * local host/port, remote initial sequence number and our local time
  * limited secret.  A received timestamp is reverted (XORed) and then
  * the contained MD5 dword is compared to the computed one to ensure the
  * timestamp belongs to the SYN-ACK we sent.  The other parameters may
  * have been tampered with but this isn't different from supplying bogus
  * values in the SYN in the first place.
  *
  * Some problems with SYN cookies remain however:
  * Consider the problem of a recreated (and retransmitted) cookie.  If the
  * original SYN was accepted, the connection is established.  The second
  * SYN is inflight, and if it arrives with an ISN that falls within the
  * receive window, the connection is killed.
  *
  * Notes:
  * A heuristic to determine when to accept syn cookies is not necessary.
  * An ACK flood would cause the syncookie verification to be attempted,
  * but a SYN flood causes syncookies to be generated.  Both are of equal
  * cost, so there's no point in trying to optimize the ACK flood case.
  * Also, if you don't process certain ACKs for some reason, then all someone
  * would have to do is launch a SYN and ACK flood at the same time, which
  * would stop cookie verification and defeat the entire purpose of syncookies.
  */
 static int tcp_sc_msstab[] = { 0, 256, 468, 536, 996, 1452, 1460, 8960 };
 
 static void
 syncookie_generate(struct syncache_head *sch, struct syncache *sc,
     u_int32_t *flowlabel)
 {
 	INIT_VNET_INET(curvnet);
 	MD5_CTX ctx;
 	u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)];
 	u_int32_t data;
 	u_int32_t *secbits;
 	u_int off, pmss, mss;
 	int i;
 
 	SCH_LOCK_ASSERT(sch);
 
 	/* Which of the two secrets to use. */
 	secbits = sch->sch_oddeven ?
 			sch->sch_secbits_odd : sch->sch_secbits_even;
 
 	/* Reseed secret if too old. */
 	if (sch->sch_reseed < time_uptime) {
 		sch->sch_oddeven = sch->sch_oddeven ? 0 : 1;	/* toggle */
 		secbits = sch->sch_oddeven ?
 				sch->sch_secbits_odd : sch->sch_secbits_even;
 		for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++)
 			secbits[i] = arc4random();
 		sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME;
 	}
 
 	/* Secret rotation offset. */
 	off = sc->sc_iss & 0x7;			/* iss was randomized before */
 
 	/* Maximum segment size calculation. */
 	pmss =
 	    max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)),	V_tcp_minmss);
 	for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--)
 		if (tcp_sc_msstab[mss] <= pmss)
 			break;
 
 	/* Fold parameters and MD5 digest into the ISN we will send. */
 	data = sch->sch_oddeven;/* odd or even secret, 1 bit */
 	data |= off << 1;	/* secret offset, derived from iss, 3 bits */
 	data |= mss << 4;	/* mss, 3 bits */
 
 	MD5Init(&ctx);
 	MD5Update(&ctx, ((u_int8_t *)secbits) + off,
 	    SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off);
 	MD5Update(&ctx, secbits, off);
 	MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc));
 	MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs));
 	MD5Update(&ctx, &data, sizeof(data));
 	MD5Final((u_int8_t *)&md5_buffer, &ctx);
 
 	data |= (md5_buffer[0] << 7);
 	sc->sc_iss = data;
 
 #ifdef INET6
 	*flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
 #endif
 
 	/* Additional parameters are stored in the timestamp if present. */
 	if (sc->sc_flags & SCF_TIMESTAMP) {
 		data =  ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */
 		data |= ((sc->sc_flags & SCF_SACK) ? 1 : 0) << 1; /* SACK, 1 bit */
 		data |= sc->sc_requested_s_scale << 2;  /* SWIN scale, 4 bits */
 		data |= sc->sc_requested_r_scale << 6;  /* RWIN scale, 4 bits */
 		data |= md5_buffer[2] << 10;		/* more digest bits */
 		data ^= md5_buffer[3];
 		sc->sc_ts = data;
 		sc->sc_tsoff = data - ticks;		/* after XOR */
 	}
 
 	V_tcpstat.tcps_sc_sendcookie++;
 }
 
 static struct syncache *
 syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, 
     struct syncache *sc, struct tcpopt *to, struct tcphdr *th,
     struct socket *so)
 {
 	INIT_VNET_INET(curvnet);
 	MD5_CTX ctx;
 	u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)];
 	u_int32_t data = 0;
 	u_int32_t *secbits;
 	tcp_seq ack, seq;
 	int off, mss, wnd, flags;
 
 	SCH_LOCK_ASSERT(sch);
 
 	/*
 	 * Pull information out of SYN-ACK/ACK and
 	 * revert sequence number advances.
 	 */
 	ack = th->th_ack - 1;
 	seq = th->th_seq - 1;
 	off = (ack >> 1) & 0x7;
 	mss = (ack >> 4) & 0x7;
 	flags = ack & 0x7f;
 
 	/* Which of the two secrets to use. */
 	secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even;
 
 	/*
 	 * The secret wasn't updated for the lifetime of a syncookie,
 	 * so this SYN-ACK/ACK is either too old (replay) or totally bogus.
 	 */
 	if (sch->sch_reseed + SYNCOOKIE_LIFETIME < time_uptime) {
 		return (NULL);
 	}
 
 	/* Recompute the digest so we can compare it. */
 	MD5Init(&ctx);
 	MD5Update(&ctx, ((u_int8_t *)secbits) + off,
 	    SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off);
 	MD5Update(&ctx, secbits, off);
 	MD5Update(&ctx, inc, sizeof(*inc));
 	MD5Update(&ctx, &seq, sizeof(seq));
 	MD5Update(&ctx, &flags, sizeof(flags));
 	MD5Final((u_int8_t *)&md5_buffer, &ctx);
 
 	/* Does the digest part of or ACK'ed ISS match? */
 	if ((ack & (~0x7f)) != (md5_buffer[0] << 7))
 		return (NULL);
 
 	/* Does the digest part of our reflected timestamp match? */
 	if (to->to_flags & TOF_TS) {
 		data = md5_buffer[3] ^ to->to_tsecr;
 		if ((data & (~0x3ff)) != (md5_buffer[2] << 10))
 			return (NULL);
 	}
 
 	/* Fill in the syncache values. */
 	bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
 	sc->sc_ipopts = NULL;
 	
 	sc->sc_irs = seq;
 	sc->sc_iss = ack;
 
 #ifdef INET6
-	if (inc->inc_isipv6) {
+	if (inc->inc_flags & INC_ISIPV6) {
 		if (sotoinpcb(so)->inp_flags & IN6P_AUTOFLOWLABEL)
 			sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
 	} else
 #endif
 	{
 		sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl;
 		sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos;
 	}
 
 	/* Additional parameters that were encoded in the timestamp. */
 	if (data) {
 		sc->sc_flags |= SCF_TIMESTAMP;
 		sc->sc_tsreflect = to->to_tsval;
 		sc->sc_ts = to->to_tsecr;
 		sc->sc_tsoff = to->to_tsecr - ticks;
 		sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0;
 		sc->sc_flags |= ((data >> 1) & 0x1) ? SCF_SACK : 0;
 		sc->sc_requested_s_scale = min((data >> 2) & 0xf,
 		    TCP_MAX_WINSHIFT);
 		sc->sc_requested_r_scale = min((data >> 6) & 0xf,
 		    TCP_MAX_WINSHIFT);
 		if (sc->sc_requested_s_scale || sc->sc_requested_r_scale)
 			sc->sc_flags |= SCF_WINSCALE;
 	} else
 		sc->sc_flags |= SCF_NOOPT;
 
 	wnd = sbspace(&so->so_rcv);
 	wnd = imax(wnd, 0);
 	wnd = imin(wnd, TCP_MAXWIN);
 	sc->sc_wnd = wnd;
 
 	sc->sc_rxmits = 0;
 	sc->sc_peer_mss = tcp_sc_msstab[mss];
 
 	V_tcpstat.tcps_sc_recvcookie++;
 	return (sc);
 }
 
 /*
  * Returns the current number of syncache entries.  This number
  * will probably change before you get around to calling 
  * syncache_pcblist.
  */
 
 int
 syncache_pcbcount(void)
 {
 	INIT_VNET_INET(curvnet);
 	struct syncache_head *sch;
 	int count, i;
 
 	for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) {
 		/* No need to lock for a read. */
 		sch = &V_tcp_syncache.hashbase[i];
 		count += sch->sch_length;
 	}
 	return count;
 }
 
 /*
  * Exports the syncache entries to userland so that netstat can display
  * them alongside the other sockets.  This function is intended to be
  * called only from tcp_pcblist.
  *
  * Due to concurrency on an active system, the number of pcbs exported
  * may have no relation to max_pcbs.  max_pcbs merely indicates the
  * amount of space the caller allocated for this function to use.
  */
 int
 syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported)
 {
 	INIT_VNET_INET(curvnet);
 	struct xtcpcb xt;
 	struct syncache *sc;
 	struct syncache_head *sch;
 	int count, error, i;
 
 	for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) {
 		sch = &V_tcp_syncache.hashbase[i];
 		SCH_LOCK(sch);
 		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
 			if (count >= max_pcbs) {
 				SCH_UNLOCK(sch);
 				goto exit;
 			}
 			if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0)
 				continue;
 			bzero(&xt, sizeof(xt));
 			xt.xt_len = sizeof(xt);
-			if (sc->sc_inc.inc_isipv6)
+			if (sc->sc_inc.inc_flags & INC_ISIPV6)
 				xt.xt_inp.inp_vflag = INP_IPV6;
 			else
 				xt.xt_inp.inp_vflag = INP_IPV4;
 			bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo));
 			xt.xt_tp.t_inpcb = &xt.xt_inp;
 			xt.xt_tp.t_state = TCPS_SYN_RECEIVED;
 			xt.xt_socket.xso_protocol = IPPROTO_TCP;
 			xt.xt_socket.xso_len = sizeof (struct xsocket);
 			xt.xt_socket.so_type = SOCK_STREAM;
 			xt.xt_socket.so_state = SS_ISCONNECTING;
 			error = SYSCTL_OUT(req, &xt, sizeof xt);
 			if (error) {
 				SCH_UNLOCK(sch);
 				goto exit;
 			}
 			count++;
 		}
 		SCH_UNLOCK(sch);
 	}
 exit:
 	*pcbs_exported = count;
 	return error;
 }
Index: head/sys/netinet/tcp_timewait.c
===================================================================
--- head/sys/netinet/tcp_timewait.c	(revision 186221)
+++ head/sys/netinet/tcp_timewait.c	(revision 186222)
@@ -1,661 +1,661 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_mac.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
 #include <sys/vimage.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/if.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/in_pcb.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 #include <netinet/ip_icmp.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #include <netinet6/ip6protosw.h>
 #include <netinet/vinet.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /* UMA zone that backs every struct tcptw; created in tcp_tw_init(). */
 static uma_zone_t tcptw_zone;
 /*
  * Administrative cap on the zone.  Zero means "not set": the cap is then
  * derived by tcptw_auto_size() instead.
  */
 static int	maxtcptw;
 
 /*
  * The timed wait queue contains references to each of the TCP sessions
  * currently in the TIME_WAIT state.  The queue pointers, including the
  * queue pointers in each tcptw structure, are protected using the global
  * tcbinfo lock, which must be held over queue iteration and modification.
  */
 #ifdef VIMAGE_GLOBALS
 static TAILQ_HEAD(, tcptw)	twq_2msl;
 int	nolocaltimewait;
 #endif
 
 /* Queue helpers defined near the end of this file. */
 static void	tcp_tw_2msl_reset(struct tcptw *, int);
 static void	tcp_tw_2msl_stop(struct tcptw *);
 
 static int
 tcptw_auto_size(void)
 {
 	INIT_VNET_INET(curvnet);
 	int span;
 
 	/*
 	 * Width of the ephemeral port range, whichever way round the two
 	 * bounds happen to be configured.
 	 */
 	span = (V_ipport_lastauto > V_ipport_firstauto) ?
 	    (V_ipport_lastauto - V_ipport_firstauto) :
 	    (V_ipport_firstauto - V_ipport_lastauto);
 
 	/*
 	 * Allow TIME_WAIT entries to occupy at most half of that range, but
 	 * never fewer than 32 (guards against tiny ranges) and never more
 	 * than a fifth of maxsockets.
 	 */
 	return (imin(imax(span / 2, 32), maxsockets / 5));
 }
 
 static int
 sysctl_maxtcptw(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	if (maxtcptw == 0)
 		new = tcptw_auto_size();
 	else
 		new = maxtcptw;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr)
 		if (new >= 32) {
 			maxtcptw = new;
 			uma_zone_set_max(tcptw_zone, maxtcptw);
 		}
 	return (error);
 }
 
 /* net.inet.tcp.maxtcptw: read/write, routed through sysctl_maxtcptw(). */
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW,
     &maxtcptw, 0, sysctl_maxtcptw, "IU",
     "Maximum number of compressed TCP TIME_WAIT entries");
 
 /* net.inet.tcp.nolocaltimewait: read/write; consulted by tcp_twstart(). */
 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, nolocaltimewait,
     CTLFLAG_RW, nolocaltimewait, 0,
     "Do not create compressed TCP TIME_WAIT entries for local connections");
 
 void
 tcp_tw_zone_change(void)
 {
 
 	/* An explicitly configured limit is left untouched. */
 	if (maxtcptw != 0)
 		return;
 
 	/* Otherwise recompute the automatic limit and apply it to the zone. */
 	uma_zone_set_max(tcptw_zone, tcptw_auto_size());
 }
 
 void
 tcp_tw_init(void)
 {
 	INIT_VNET_INET(curvnet);
 
 	/* Create the zone that struct tcptw allocations are served from. */
 	tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 
 	/* A loader tunable may preset the limit; zero means auto-size. */
 	TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw);
 	uma_zone_set_max(tcptw_zone,
 	    (maxtcptw != 0) ? maxtcptw : tcptw_auto_size());
 
 	/* Start with an empty 2MSL queue. */
 	TAILQ_INIT(&V_twq_2msl);
 }
 
 /*
  * Move a TCP connection into TIME_WAIT state.
  *    tcbinfo is locked.
  *    inp is locked, and is unlocked before returning.
  */
 void
 tcp_twstart(struct tcpcb *tp)
 {
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 	INIT_VNET_INET(tp->t_vnet);
 #endif
 	struct tcptw *tw;
 	struct inpcb *inp = tp->t_inpcb;
 	int acknow;
 	struct socket *so;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);	/* tcp_tw_2msl_reset(). */
 	INP_WLOCK_ASSERT(inp);
 
 	if (V_nolocaltimewait && in_localip(inp->inp_faddr)) {
 		tp = tcp_close(tp);
 		if (tp != NULL)
 			INP_WUNLOCK(inp);
 		return;
 	}
 
 	tw = uma_zalloc(tcptw_zone, M_NOWAIT);
 	if (tw == NULL) {
 		tw = tcp_tw_2msl_scan(1);
 		if (tw == NULL) {
 			tp = tcp_close(tp);
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 			return;
 		}
 	}
 	tw->tw_inpcb = inp;
 
 	/*
 	 * Recover last window size sent.
 	 */
 	tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;
 
 	/*
 	 * Set t_recent if timestamps are used on the connection.
 	 */
 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
 	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
 		tw->t_recent = tp->ts_recent;
 		tw->ts_offset = tp->ts_offset;
 	} else {
 		tw->t_recent = 0;
 		tw->ts_offset = 0;
 	}
 
 	tw->snd_nxt = tp->snd_nxt;
 	tw->rcv_nxt = tp->rcv_nxt;
 	tw->iss     = tp->iss;
 	tw->irs     = tp->irs;
 	tw->t_starttime = tp->t_starttime;
 	tw->tw_time = 0;
 
 /* XXX
  * If this code will
  * be used for fin-wait-2 state also, then we may need
  * a ts_recent from the last segment.
  */
 	acknow = tp->t_flags & TF_ACKNOW;
 
 	/*
 	 * First, discard tcpcb state, which includes stopping its timers and
 	 * freeing it.  tcp_discardcb() used to also release the inpcb, but
 	 * that work is now done in the caller.
 	 *
 	 * Note: soisdisconnected() call used to be made in tcp_discardcb(),
 	 * and might not be needed here any longer.
 	 */
 	tcp_discardcb(tp);
 	so = inp->inp_socket;
 	soisdisconnected(so);
 	tw->tw_cred = crhold(so->so_cred);
 	SOCK_LOCK(so);
 	tw->tw_so_options = so->so_options;
 	SOCK_UNLOCK(so);
 	if (acknow)
 		tcp_twrespond(tw, TH_ACK);
 	inp->inp_ppcb = tw;
 	inp->inp_vflag |= INP_TIMEWAIT;
 	tcp_tw_2msl_reset(tw, 0);
 
 	/*
 	 * If the inpcb owns the sole reference to the socket, then we can
 	 * detach and free the socket as it is not needed in time wait.
 	 */
 	if (inp->inp_vflag & INP_SOCKREF) {
 		KASSERT(so->so_state & SS_PROTOREF,
 		    ("tcp_twstart: !SS_PROTOREF"));
 		inp->inp_vflag &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
 		ACCEPT_LOCK();
 		SOCK_LOCK(so);
 		so->so_state &= ~SS_PROTOREF;
 		sofree(so);
 	} else
 		INP_WUNLOCK(inp);
 }
 
 #if 0
 /*
  * NOTE: everything down to the matching #endif is compiled out.
  *
  * The approximate rate of ISN increase of Microsoft TCP stacks;
  * the actual rate is slightly higher due to the addition of
  * random positive increments.
  *
  * Most other new OSes use semi-randomized ISN values, so we
  * do not need to worry about them.
  */
 #define MS_ISN_BYTES_PER_SECOND		250000
 
 /*
  * Determine if the ISN we will generate has advanced beyond the last
  * sequence number used by the previous connection.  If so, indicate
  * that it is safe to recycle this tw socket by returning 1.
  */
 int
 tcp_twrecycleable(struct tcptw *tw)
 {
 	INIT_VNET_INET(curvnet);
 	tcp_seq new_iss = tw->iss;
 	tcp_seq new_irs = tw->irs;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	/*
 	 * Project both initial sequence numbers forward by the time elapsed
 	 * since the connection started.
 	 * NOTE(review): ISN_BYTES_PER_SECOND is not defined in this file;
 	 * confirm where it comes from before re-enabling this code.
 	 */
 	new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz);
 	new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz);
 
 	if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt))
 		return (1);
 	else
 		return (0);
 }
 #endif
 
 /*
  * Returns 1 if the TIME_WAIT state was killed and we should start over,
  * looking for a pcb in the listen state.  Returns 0 otherwise.
  */
 int
 tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
     struct mbuf *m, int tlen)
 {
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 	INIT_VNET_INET(curvnet);
 #endif
 	struct tcptw *tw;
 	int thflags;
 	tcp_seq seq;
 
 	/* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * XXXRW: Time wait state for inpcb has been recycled, but inpcb is
 	 * still present.  This is undesirable, but temporarily necessary
 	 * until we work out how to handle inpcb's who's timewait state has
 	 * been removed.
 	 */
 	tw = intotw(inp);
 	if (tw == NULL)
 		goto drop;
 
 	thflags = th->th_flags;
 
 	/*
 	 * NOTE: for FIN_WAIT_2 (to be added later),
 	 * must validate sequence number before accepting RST
 	 */
 
 	/*
 	 * If the segment contains RST:
 	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
 	 *      RFC 1337.
 	 */
 	if (thflags & TH_RST)
 		goto drop;
 
 #if 0
 /* PAWS not needed at the moment */
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
 	 * and it's less than ts_recent, drop it.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
 		if ((thflags & TH_ACK) == 0)
 			goto drop;
 		goto ack;
 	}
 	/*
 	 * ts_recent is never updated because we never accept new segments.
 	 */
 #endif
 
 	/*
 	 * If a new connection request is received
 	 * while in TIME_WAIT, drop the old connection
 	 * and start over if the sequence numbers
 	 * are above the previous ones.
 	 *
 	 * The inpcb is not unlocked and the mbuf is not freed on this
 	 * path; the return value of 1 tells the caller to start over.
 	 */
 	if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
 		tcp_twclose(tw, 0);
 		return (1);
 	}
 
 	/*
 	 * Drop the segment if it does not contain an ACK.
 	 */
 	if ((thflags & TH_ACK) == 0)
 		goto drop;
 
 	/*
 	 * Reset the 2MSL timer if this is a duplicate FIN.
 	 */
 	if (thflags & TH_FIN) {
 		seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
 		if (seq + 1 == tw->rcv_nxt)
 			tcp_tw_2msl_reset(tw, 1);
 	}
 
 	/*
 	 * Acknowledge the segment if it has data or is not a duplicate ACK.
 	 */
 	if (thflags != TH_ACK || tlen != 0 ||
 	    th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt)
 		tcp_twrespond(tw, TH_ACK);
 
 	/*
 	 * Every remaining path ends by dropping the incoming segment.  The
 	 * RST-generating code that used to follow an unconditional
 	 * "goto drop" here could never execute and has been removed.
 	 */
 drop:
 	INP_WUNLOCK(inp);
 	m_freem(m);
 	return (0);
 }
 
 void
 tcp_twclose(struct tcptw *tw, int reuse)
 {
 	INIT_VNET_INET(curvnet);
 	struct socket *so;
 	struct inpcb *inp;
 
 	/*
 	 * Tear down a TIME_WAIT entry.  Either there is no socket any more
 	 * (only the inpcb<->tcptw pair is left and everything can go), or a
 	 * socket still exists and any reference we own on it is released.
 	 * When "reuse" is non-zero the tcptw itself is not returned to the
 	 * zone, so the caller can keep using it.
 	 */
 	inp = tw->tw_inpcb;
 	KASSERT((inp->inp_vflag & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
 	KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);	/* tcp_tw_2msl_stop(). */
 	INP_WLOCK_ASSERT(inp);
 
 	/* Break the link in both directions and leave the 2MSL queue. */
 	tw->tw_inpcb = NULL;
 	tcp_tw_2msl_stop(tw);
 	inp->inp_ppcb = NULL;
 	in_pcbdrop(inp);
 
 	so = inp->inp_socket;
 	if (so == NULL) {
 		/* No socket: the inpcb can be freed outright. */
 		in_pcbfree(inp);
 	} else if (inp->inp_vflag & INP_SOCKREF) {
 		/*
 		 * We hold the strong reference on the socket; give it up.
 		 * The inpcb lock is dropped before the socket locks are
 		 * taken.
 		 */
 		inp->inp_vflag &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
 		ACCEPT_LOCK();
 		SOCK_LOCK(so);
 		KASSERT(so->so_state & SS_PROTOREF,
 		    ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF"));
 		so->so_state &= ~SS_PROTOREF;
 		sofree(so);
 	} else {
 		/*
 		 * Somebody else still references the socket (XXXRW: think
 		 * about this more); socket and inpcb stay around for
 		 * tcp_usr_detach() to deal with later.
 		 */
 		INP_WUNLOCK(inp);
 	}
 
 	V_tcpstat.tcps_closed++;
 	crfree(tw->tw_cred);
 	tw->tw_cred = NULL;
 	if (!reuse)
 		uma_zfree(tcptw_zone, tw);
 }
 
 /*
  * Build and transmit a bare TCP segment carrying "flags" on behalf of a
  * connection in TIME_WAIT, using the sequence state saved in the tcptw.
  * Returns ENOBUFS if no mbuf is available, otherwise the result of
  * ip_output()/ip6_output().  The inpcb must be write-locked.
  */
 int
 tcp_twrespond(struct tcptw *tw, int flags)
 {
 	INIT_VNET_INET(curvnet);
 	struct inpcb *inp = tw->tw_inpcb;
 	struct tcphdr *th;
 	struct mbuf *m;
 	struct ip *ip = NULL;
 	u_int hdrlen, optlen;
 	int error;
 	struct tcpopt to;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
-	int isipv6 = inp->inp_inc.inc_isipv6;
+	int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 
 	m = m_gethdr(M_DONTWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 	/* Leave room in front of the packet for a link-layer header. */
 	m->m_data += max_linkhdr;
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 
 	/* Lay out the IP (or IPv6) header followed by the TCP header. */
 #ifdef INET6
 	if (isipv6) {
 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		tcpip_fillheaders(inp, ip6, th);
 	} else
 #endif
 	{
 		hdrlen = sizeof(struct tcpiphdr);
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)(ip + 1);
 		tcpip_fillheaders(inp, ip, th);
 	}
 	to.to_flags = 0;
 
 	/*
 	 * Send a timestamp and echo-reply if both our side and our peer
 	 * have sent timestamps in our SYN's and this is not a RST.
 	 */
 	if (tw->t_recent && flags == TH_ACK) {
 		to.to_flags |= TOF_TS;
 		to.to_tsval = ticks + tw->ts_offset;
 		to.to_tsecr = tw->t_recent;
 	}
 	/* Options are written directly behind the fixed TCP header. */
 	optlen = tcp_addoptions(&to, (u_char *)(th + 1));
 
 	m->m_len = hdrlen + optlen;
 	m->m_pkthdr.len = m->m_len;
 
 	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
 
 	/* Sequence numbers and window come from the saved TIME_WAIT state. */
 	th->th_seq = htonl(tw->snd_nxt);
 	th->th_ack = htonl(tw->rcv_nxt);
 	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 	th->th_flags = flags;
 	th->th_win = htons(tw->last_win);
 
 #ifdef INET6
 	if (isipv6) {
 		/* IPv6: the TCP checksum is computed here. */
 		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
 		    sizeof(struct tcphdr) + optlen);
 		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
 		error = ip6_output(m, inp->in6p_outputopts, NULL,
 		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
 	} else
 #endif
 	{
 		/*
 		 * IPv4: only the pseudo-header sum is stored and CSUM_TCP is
 		 * flagged on the mbuf (presumably so the checksum is finished
 		 * further down the output path -- confirm in ip_output()).
 		 */
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 		ip->ip_len = m->m_pkthdr.len;
 		if (V_path_mtu_discovery)
 			ip->ip_off |= IP_DF;
 		error = ip_output(m, inp->inp_options, NULL,
 		    ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
 		    NULL, inp);
 	}
 	/* Counters are bumped whether or not the output call succeeded. */
 	if (flags & TH_ACK)
 		V_tcpstat.tcps_sndacks++;
 	else
 		V_tcpstat.tcps_sndctrl++;
 	V_tcpstat.tcps_sndtotal++;
 	return (error);
 }
 
 static void
 tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
 {
 	INIT_VNET_INET(curvnet);
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tw->tw_inpcb);
 
 	/*
 	 * When re-arming, the entry is already on the queue and has to be
 	 * unlinked first; a fresh entry (rearm == 0) is simply appended.
 	 */
 	if (rearm != 0)
 		TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
 
 	/* Expire 2*MSL from now and queue at the tail. */
 	tw->tw_time = ticks + 2 * tcp_msl;
 	TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
 }
 
 /*
  * Unlink a tcptw from the 2MSL queue.  The tcbinfo write lock must be
  * held; the entry is assumed to be on the queue.
  */
 static void
 tcp_tw_2msl_stop(struct tcptw *tw)
 {
 	INIT_VNET_INET(curvnet);
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
 }
 
 /*
  * Walk the 2MSL queue from the head.
  *
  * reuse == 0: close every entry whose timer has expired and return NULL.
  * reuse != 0: close the first entry regardless of its timer and return
  *             it (not freed) so the caller can recycle it; NULL if the
  *             queue is empty.
  *
  * The tcbinfo write lock must be held.
  */
 struct tcptw *
 tcp_tw_2msl_scan(int reuse)
 {
 	INIT_VNET_INET(curvnet);
 	struct tcptw *tw;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	for (;;) {
 		tw = TAILQ_FIRST(&V_twq_2msl);
 		/*
 		 * Compare via the signed difference rather than
 		 * "tw_time > ticks" so the test stays correct when the
 		 * tick counter wraps around.
 		 */
 		if (tw == NULL ||
 		    (!reuse && (int)(tw->tw_time - ticks) > 0))
 			break;
 		/* tcp_twclose() expects the inpcb locked and releases it. */
 		INP_WLOCK(tw->tw_inpcb);
 		tcp_twclose(tw, reuse);
 		if (reuse)
 			return (tw);
 	}
 	return (NULL);
 }
Index: head/sys/netinet/tcp_usrreq.c
===================================================================
--- head/sys/netinet/tcp_usrreq.c	(revision 186221)
+++ head/sys/netinet/tcp_usrreq.c	(revision 186222)
@@ -1,1898 +1,1898 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2006-2007 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/mbuf.h>
 #ifdef INET6
 #include <sys/domain.h>
 #endif /* INET6 */
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/jail.h>
 #include <sys/vimage.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/in_pcb.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #include <netinet/tcp_offload.h>
 #include <netinet/vinet.h>
 
 /*
  * TCP protocol interface to socket abstraction.
  */
 static int	tcp_attach(struct socket *);
 static int	tcp_connect(struct tcpcb *, struct sockaddr *,
 		    struct thread *td);
 #ifdef INET6
 static int	tcp6_connect(struct tcpcb *, struct sockaddr *,
 		    struct thread *td);
 #endif /* INET6 */
 static void	tcp_disconnect(struct tcpcb *);
 static void	tcp_usrclosed(struct tcpcb *);
 static void	tcp_fill_info(struct tcpcb *, struct tcp_info *);
 
 /*
  * Tracing hooks used by the pru_* handlers below.  They rely on the
  * handler having locals named "tp" and "so":
  *   TCPDEBUG0      declares "ostate" (must sit where a declaration may);
  *   TCPDEBUG1()    records the current TCP state of tp, or 0 if tp is NULL;
  *   TCPDEBUG2(req) calls tcp_trace() if tp is set and SO_DEBUG is enabled.
  * Without TCPDEBUG all three expand to nothing.
  */
 #ifdef TCPDEBUG
 #define	TCPDEBUG0	int ostate = 0
 #define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
 #define	TCPDEBUG2(req)	if (tp && (so->so_options & SO_DEBUG)) \
 				tcp_trace(TA_USER, ostate, tp, 0, 0, req)
 #else
 #define	TCPDEBUG0
 #define	TCPDEBUG1()
 #define	TCPDEBUG2(req)
 #endif
 
 /*
  * TCP attaches to socket via pru_attach(), reserving space,
  * and an internet control block.
  */
 static int
 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	int error;
 	TCPDEBUG0;
 
 	/* The socket must not have a protocol control block yet. */
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
 	TCPDEBUG1();
 
 	error = tcp_attach(so);
 	if (error == 0) {
 		/* SO_LINGER without a linger time gets the TCP default. */
 		if ((so->so_options & SO_LINGER) != 0 && so->so_linger == 0)
 			so->so_linger = TCP_LINGERTIME;
 
 		/* Pick up the freshly attached pcb for the trace hook. */
 		inp = sotoinpcb(so);
 		tp = intotcpcb(inp);
 	}
 	TCPDEBUG2(PRU_ATTACH);
 	return error;
 }
 
 /*
  * tcp_detach is called when the socket layer loses its final reference
  * to the socket, be it a file descriptor reference, a reference from TCP,
  * etc.  At this point, there is only one case in which we will keep around
  * inpcb state: time wait.
  *
  * This function can probably be re-absorbed back into tcp_usr_detach() now
  * that there is a single detach path.
  */
 /*
  * Detach the socket from its inpcb.  Called with the tcbinfo write lock
  * and the inpcb write lock held; the inpcb lock is always given up before
  * returning, either by unlocking or by freeing the inpcb.
  */
 static void
 tcp_detach(struct socket *so, struct inpcb *inp)
 {
 	struct tcpcb *tp;
 #ifdef INVARIANTS
 	INIT_VNET_INET(so->so_vnet);
 #endif
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
 	KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so"));
 
 	tp = intotcpcb(inp);
 
 	if (inp->inp_vflag & INP_TIMEWAIT) {
 		/*
 		 * There are two cases to handle: one in which the time wait
 		 * state is being discarded (INP_DROPPED), and one in which
 		 * this connection will remain in timewait.  In the former,
 		 * it is time to discard all state (except tcptw, which has
 		 * already been discarded by the timewait close code, which
 		 * should be further up the call stack somewhere).  In the
 		 * latter case, we detach from the socket, but leave the pcb
 		 * present until timewait ends.
 		 *
 		 * XXXRW: Would it be cleaner to free the tcptw here?
 		 */
 		if (inp->inp_vflag & INP_DROPPED) {
 			KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
 			    "INP_DROPPED && tp != NULL"));
 			in_pcbdetach(inp);
 			in_pcbfree(inp);
 		} else {
 			in_pcbdetach(inp);
 			INP_WUNLOCK(inp);
 		}
 	} else {
 		/*
 		 * If the connection is not in timewait, we consider two
 		 * two conditions: one in which no further processing is
 		 * necessary (dropped || embryonic), and one in which TCP is
 		 * not yet done, but no longer requires the socket, so the
 		 * pcb will persist for the time being.
 		 *
 		 * XXXRW: Does the second case still occur?
 		 */
 		if (inp->inp_vflag & INP_DROPPED ||
 		    tp->t_state < TCPS_SYN_SENT) {
 			tcp_discardcb(tp);
 			in_pcbdetach(inp);
 			in_pcbfree(inp);
 		} else {
 			/*
 			 * The pcb outlives the socket here, so it must be
 			 * unlocked: the caller does not release the inpcb
 			 * lock itself, and leaving it held would leak it.
 			 */
 			in_pcbdetach(inp);
 			INP_WUNLOCK(inp);
 		}
 	}
 }
 
 /*
  * pru_detach() detaches the TCP protocol from the socket.
  * If the protocol state is non-embryonic, then can't
  * do this directly: have to initiate a pru_disconnect(),
  * which may finish later; embryonic TCB's can just
  * be discarded here.
  */
 static void
 tcp_usr_detach(struct socket *so)
 {
 	INIT_VNET_INET(so->so_vnet);
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
 	/* Global tcbinfo lock first, then the per-connection inpcb lock. */
 	INP_INFO_WLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	KASSERT(inp->inp_socket != NULL,
 	    ("tcp_usr_detach: inp_socket == NULL"));
 	/*
 	 * The inpcb lock is not released here; tcp_detach() is expected to
 	 * unlock or free the inpcb before it returns.
 	 */
 	tcp_detach(so, inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 }
 
 /*
  * Give the socket an address.
  */
 static int
 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	INIT_VNET_INET(so->so_vnet);
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct sockaddr_in *sinp;
 
 	/* Validate the address before any lock is taken. */
 	sinp = (struct sockaddr_in *)nam;
 	if (nam->sa_len != sizeof (*sinp))
 		return (EINVAL);
 	/* Binding to a multicast address is not allowed. */
 	if (sinp->sin_family == AF_INET &&
 	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
 		return (EAFNOSUPPORT);
 
 	TCPDEBUG0;
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
 		/* The connection is already gone or winding down. */
 		error = EINVAL;
 	} else {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 		error = in_pcbbind(inp, nam, td->td_ucred);
 	}
 	TCPDEBUG2(PRU_BIND);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 
 	return (error);
 }
 
 #ifdef INET6
 static int
 tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	INIT_VNET_INET(so->so_vnet);
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct sockaddr_in6 *sin6p;
 
 	sin6p = (struct sockaddr_in6 *)nam;
 	if (nam->sa_len != sizeof (*sin6p))
 		return (EINVAL);
 	/*
 	 * Must check for multicast addresses and disallow binding
 	 * to them.
 	 */
 	if (sin6p->sin6_family == AF_INET6 &&
 	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
 		return (EAFNOSUPPORT);
 
 	TCPDEBUG0;
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
 			inp->inp_vflag |= INP_IPV4;
 		else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
 			struct sockaddr_in sin;
 
 			in6_sin6_2_sin(&sin, sin6p);
 			inp->inp_vflag |= INP_IPV4;
 			inp->inp_vflag &= ~INP_IPV6;
 			error = in_pcbbind(inp, (struct sockaddr *)&sin,
 			    td->td_ucred);
 			goto out;
 		}
 	}
 	error = in6_pcbbind(inp, nam, td->td_ucred);
 out:
 	TCPDEBUG2(PRU_BIND);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 #endif /* INET6 */
 
 /*
  * Prepare to accept connections.
  */
 static int
 tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
 {
 	INIT_VNET_INET(so->so_vnet);
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 
 	TCPDEBUG0;
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error == 0 && inp->inp_lport == 0)
 		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
 	if (error == 0) {
 		tp->t_state = TCPS_LISTEN;
 		solisten_proto(so, backlog);
 		tcp_offload_listen_open(tp);
 	}
 	SOCK_UNLOCK(so);
 
 out:
 	TCPDEBUG2(PRU_LISTEN);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 
 #ifdef INET6
 static int
 tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
 {
 	INIT_VNET_INET(so->so_vnet);
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 
 	TCPDEBUG0;
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error == 0 && inp->inp_lport == 0) {
 		inp->inp_vflag &= ~INP_IPV4;
 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
 			inp->inp_vflag |= INP_IPV4;
 		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
 	}
 	if (error == 0) {
 		tp->t_state = TCPS_LISTEN;
 		solisten_proto(so, backlog);
 	}
 	SOCK_UNLOCK(so);
 
 out:
 	TCPDEBUG2(PRU_LISTEN);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 #endif /* INET6 */
 
 /*
  * Initiate connection to peer.
  * Create a template for use in transmissions on this connection.
  * Enter SYN_SENT state, and mark socket as connecting.
  * Start keep-alive timer, and seed output sequence space.
  * Send initial segment on connection.
  */
 static int
 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	INIT_VNET_INET(so->so_vnet);
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct sockaddr_in *sinp;
 
 	sinp = (struct sockaddr_in *)nam;
 	if (nam->sa_len != sizeof (*sinp))
 		return (EINVAL);
 	/*
 	 * Must disallow TCP ``connections'' to multicast addresses.
 	 */
 	if (sinp->sin_family == AF_INET
 	    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
 		return (EAFNOSUPPORT);
 	if (prison_remote_ip4(td->td_ucred, &sinp->sin_addr) != 0)
 		return (EINVAL);
 
 	TCPDEBUG0;
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	if ((error = tcp_connect(tp, nam, td)) != 0)
 		goto out;
 	error = tcp_output_connect(so, nam);
 out:
 	TCPDEBUG2(PRU_CONNECT);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 
 #ifdef INET6
 /*
  * IPv6 flavour of tcp_usr_connect().  A v4-mapped destination is turned
  * into an IPv4 connection (refused on V6ONLY sockets); anything else is
  * connected as native IPv6.
  */
 static int
 tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	INIT_VNET_INET(so->so_vnet);
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct sockaddr_in6 *sin6p;
 
 	TCPDEBUG0;
 
 	/* Validate the destination before any lock is taken. */
 	sin6p = (struct sockaddr_in6 *)nam;
 	if (nam->sa_len != sizeof (*sin6p))
 		return (EINVAL);
 	/*
 	 * Must disallow TCP ``connections'' to multicast addresses.
 	 */
 	if (sin6p->sin6_family == AF_INET6
 	    && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
 		return (EAFNOSUPPORT);
 
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
 		struct sockaddr_in sin;
 
 		/* V6ONLY sockets may not reach IPv4 peers. */
 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
 			error = EINVAL;
 			goto out;
 		}
 
 		/* Convert to a sockaddr_in and mark the pcb as IPv4. */
 		in6_sin6_2_sin(&sin, sin6p);
 		inp->inp_vflag |= INP_IPV4;
 		inp->inp_vflag &= ~INP_IPV6;
 		if (prison_remote_ip4(td->td_ucred, &sin.sin_addr) != 0) {
 			error = EINVAL;
 			goto out;
 		}
 		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
 			goto out;
 		/*
 		 * NOTE(review): the original sockaddr_in6 (nam), not the
 		 * converted sin, is handed to tcp_output_connect() here --
 		 * confirm this is what that call expects.
 		 */
 		error = tcp_output_connect(so, nam);
 		goto out;
 	}
 	/* Native IPv6 destination: mark the pcb and its conninfo as IPv6. */
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
-	inp->inp_inc.inc_isipv6 = 1;
+	inp->inp_inc.inc_flags |= INC_ISIPV6;
 	if (prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr) != 0) {
 		error = EINVAL;
 		goto out;
 	}
 	if ((error = tcp6_connect(tp, nam, td)) != 0)
 		goto out;
 	error = tcp_output_connect(so, nam);
 
 out:
 	TCPDEBUG2(PRU_CONNECT);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 #endif /* INET6 */
 
 /*
  * Begin disconnecting from the peer.
  *
  * A connection that never got past the embryonic stage is simply dropped,
  * as is one that has no need to let data drain.  Otherwise the TCP
  * shutdown sequence starts: the socket is marked disconnecting, unread
  * data is drained, the state is switched to reflect the user close, and a
  * segment (e.g. FIN) is sent to the peer.  The socket is really
  * disconnected once the peer sends its FIN and acks ours.
  *
  * Returns ECONNRESET if the pcb is already in TIME_WAIT or dropped,
  * otherwise 0.
  *
  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
  */
 static int
 tcp_usr_disconnect(struct socket *so)
 {
 	INIT_VNET_INET(so->so_vnet);
 	struct tcpcb *tp = NULL;
 	struct inpcb *inp;
 	int error = 0;
 
 	TCPDEBUG0;
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
 	INP_WLOCK(inp);
 	if ((inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) == 0) {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 		tcp_disconnect(tp);
 	} else {
 		/* No full TCP state left to disconnect. */
 		error = ECONNRESET;
 	}
 	TCPDEBUG2(PRU_DISCONNECT);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 
 /*
  * Accept a connection.  Nearly all of the work has already been done at
  * higher levels; what remains is to hand back the peer's address through
  * 'nam'.
  *
  * Returns ECONNABORTED if the socket is already disconnected or the pcb is
  * in TIME_WAIT or dropped, otherwise 0 with *nam set.
  */
 static int
 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
 {
 	INIT_VNET_INET(so->so_vnet);
 	struct tcpcb *tp = NULL;
 	struct inpcb *inp = NULL;
 	struct in_addr addr;
 	in_port_t port = 0;
 	int error = 0;
 	TCPDEBUG0;
 
 	if (so->so_state & SS_ISDISCONNECTED)
 		return (ECONNABORTED);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
 	INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	if ((inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) != 0) {
 		error = ECONNABORTED;
 	} else {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 
 		/*
 		 * in_getpeeraddr and COMMON_END are inlined here: only the
 		 * foreign address and port are copied under the lock, and
 		 * the sockaddr allocation is deferred until after the locks
 		 * have been released.
 		 */
 		addr = inp->inp_faddr;
 		port = inp->inp_fport;
 	}
 	TCPDEBUG2(PRU_ACCEPT);
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	if (error != 0)
 		return error;
 	*nam = in_sockaddr(port, &addr);
 	return error;
 }
 
 #ifdef INET6
 /*
  * IPv6 flavour of tcp_usr_accept(): return the peer's address through
  * 'nam', as a v4-mapped sockaddr_in6 when the pcb is flagged INP_IPV4 and
  * as a plain sockaddr_in6 otherwise.
  *
  * Returns ECONNABORTED if the socket is already disconnected or the pcb is
  * in TIME_WAIT or dropped, otherwise 0 with *nam set.
  */
 static int
 tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
 {
 	struct tcpcb *tp = NULL;
 	struct inpcb *inp = NULL;
 	struct in6_addr addr6;
 	struct in_addr addr;
 	in_port_t port = 0;
 	int error = 0;
 	int v4 = 0;
 	TCPDEBUG0;
 
 	if (so->so_state & SS_ISDISCONNECTED)
 		return (ECONNABORTED);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
 	INP_WLOCK(inp);
 	if ((inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) != 0) {
 		error = ECONNABORTED;
 	} else {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 
 		/*
 		 * in6_mapped_peeraddr and COMMON_END are inlined here: the
 		 * data of interest is copied under the lock and the
 		 * sockaddr allocation is deferred until the lock has been
 		 * released.
 		 */
 		port = inp->inp_fport;
 		if ((inp->inp_vflag & INP_IPV4) != 0) {
 			v4 = 1;
 			addr = inp->inp_faddr;
 		} else {
 			addr6 = inp->in6p_faddr;
 		}
 	}
 	TCPDEBUG2(PRU_ACCEPT);
 	INP_WUNLOCK(inp);
 	if (error != 0)
 		return error;
 	if (v4)
 		*nam = in6_v4mapsin6_sockaddr(port, &addr);
 	else
 		*nam = in6_sockaddr(port, &addr6);
 	return error;
 }
 #endif /* INET6 */
 
 /*
  * Mark the connection as being incapable of further output.
  *
  * Returns ECONNRESET if the pcb is in TIME_WAIT or dropped, otherwise the
  * result of tcp_output_disconnect().
  */
 static int
 tcp_usr_shutdown(struct socket *so)
 {
 	INIT_VNET_INET(so->so_vnet);
 	struct tcpcb *tp = NULL;
 	struct inpcb *inp;
 	int error = 0;
 
 	TCPDEBUG0;
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("inp == NULL"));
 	INP_WLOCK(inp);
 	if ((inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) == 0) {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 		/* Stop further sends, advance the state, push the change. */
 		socantsendmore(so);
 		tcp_usrclosed(tp);
 		error = tcp_output_disconnect(tp);
 	} else {
 		error = ECONNRESET;
 	}
 
 	TCPDEBUG2(PRU_SHUTDOWN);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 
 	return (error);
 }
 
 /*
  * After a receive, possibly send a window update to the peer.
  *
  * 'flags' is not examined.  Returns ECONNRESET if the pcb is in TIME_WAIT
  * or dropped, otherwise 0.
  */
 static int
 tcp_usr_rcvd(struct socket *so, int flags)
 {
 	struct tcpcb *tp = NULL;
 	struct inpcb *inp;
 	int error = 0;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
 	INP_WLOCK(inp);
 	if ((inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) == 0) {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 		tcp_output_rcvd(tp);
 	} else {
 		error = ECONNRESET;
 	}
 
 	TCPDEBUG2(PRU_RCVD);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Do a send by putting data in output queue and updating urgent
  * marker if URG set.  Possibly send more data.  Unlike the other
  * pru_*() routines, the mbuf chains are our responsibility.  We
  * must either enqueue them or free them.  The other pru_* routines
  * generally are caller-frees.
  *
  * so      - socket being written to.
  * flags   - PRUS_* flags; PRUS_OOB, PRUS_EOF and PRUS_MORETOCOME are
  *           examined here.
  * m       - data to append to the send buffer.
  * nam     - optional destination; when non-NULL and the connection has
  *           not reached SYN_SENT an implied connect is performed.
  * control - optional control mbuf; always freed here, and rejected with
  *           EINVAL when it is non-empty.
  * td      - calling thread, passed through to the connect routines.
  *
  * Returns 0 or an errno: ECONNRESET when the pcb is in TIME_WAIT or
  * dropped, EINVAL for non-empty control data, ENOBUFS when the OOB path
  * finds the send buffer too far over its limit, or the error from the
  * connect or output routines.
  */
 static int
 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *nam, struct mbuf *control, struct thread *td)
 {
 	INIT_VNET_INET(so->so_vnet);
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	int headlocked = 0;	/* non-zero while we hold the tcbinfo lock */
 #ifdef INET6
 	int isipv6;
 #endif
 	TCPDEBUG0;
 
 	/*
 	 * We require the pcbinfo lock in two cases:
 	 *
 	 * (1) An implied connect is taking place, which can result in
 	 *     binding IPs and ports and hence modification of the pcb hash
 	 *     chains.
 	 *
 	 * (2) PRUS_EOF is set, resulting in explicit close on the send.
 	 */
 	if ((nam != NULL) || (flags & PRUS_EOF)) {
 		INP_INFO_WLOCK(&V_tcbinfo);
 		headlocked = 1;
 	}
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
 		/* Nothing can be queued; both chains are ours to free. */
 		if (control)
 			m_freem(control);
 		if (m)
 			m_freem(m);
 		error = ECONNRESET;
 		goto out;
 	}
 #ifdef INET6
 	isipv6 = nam && nam->sa_family == AF_INET6;
 #endif /* INET6 */
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	if (control) {
 		/* TCP doesn't do control messages (rights, creds, etc) */
 		if (control->m_len) {
 			m_freem(control);
 			if (m)
 				m_freem(m);
 			error = EINVAL;
 			goto out;
 		}
 		m_freem(control);	/* empty control, just free it */
 	}
 	if (!(flags & PRUS_OOB)) {
 		/* Ordinary data: from here on 'm' belongs to the send buffer. */
 		sbappendstream(&so->so_snd, m);
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected,
 			 * initialize window to default value, and
 			 * initialize maxseg/maxopd using peer's cached
 			 * MSS.
 			 */
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 #ifdef INET6
 			if (isipv6)
 				error = tcp6_connect(tp, nam, td);
 			else
 #endif /* INET6 */
 			error = tcp_connect(tp, nam, td);
 			if (error)
 				goto out;
 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
 			tcp_mss(tp, -1);
 		}
 		if (flags & PRUS_EOF) {
 			/*
 			 * Close the send side of the connection after
 			 * the data is sent.
 			 */
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 			socantsendmore(so);
 			tcp_usrclosed(tp);
 		}
 		/* The tcbinfo lock is released before the output call. */
 		if (headlocked) {
 			INP_INFO_WUNLOCK(&V_tcbinfo);
 			headlocked = 0;
 		}
 		if (tp != NULL) {
 			/* TF_MORETOCOME is set only for this one output call. */
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags |= TF_MORETOCOME;
 			error = tcp_output_send(tp);
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags &= ~TF_MORETOCOME;
 		}
 	} else {
 		/*
 		 * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (sbspace(&so->so_snd) < -512) {
 			/* Send buffer too far over its limit; drop the data. */
 			SOCKBUF_UNLOCK(&so->so_snd);
 			m_freem(m);
 			error = ENOBUFS;
 			goto out;
 		}
 		/*
 		 * According to RFC961 (Assigned Protocols),
 		 * the urgent pointer points to the last octet
 		 * of urgent data.  We continue, however,
 		 * to consider it to indicate the first octet
 		 * of data past the urgent section.
 		 * Otherwise, snd_up should be one lower.
 		 */
 		sbappendstream_locked(&so->so_snd, m);
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected,
 			 * initialize window to default value, and
 			 * initialize maxseg/maxopd using peer's cached
 			 * MSS.
 			 */
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 #ifdef INET6
 			if (isipv6)
 				error = tcp6_connect(tp, nam, td);
 			else
 #endif /* INET6 */
 			error = tcp_connect(tp, nam, td);
 			if (error)
 				goto out;
 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
 			tcp_mss(tp, -1);
 			INP_INFO_WUNLOCK(&V_tcbinfo);
 			headlocked = 0;
 		} else if (nam) {
 			/* Lock was taken for 'nam' but no connect was needed. */
 			INP_INFO_WUNLOCK(&V_tcbinfo);
 			headlocked = 0;
 		}
 		/* Urgent pointer covers everything now in the send buffer. */
 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
 		tp->t_flags |= TF_FORCEDATA;
 		error = tcp_output_send(tp);
 		tp->t_flags &= ~TF_FORCEDATA;
 	}
 out:
 	TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
 		  ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
 	INP_WUNLOCK(inp);
 	/* Error paths may arrive here with the tcbinfo lock still held. */
 	if (headlocked)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 
 /*
  * Abort the TCP: drop the connection abruptly.
  *
  * Unless the pcb ends up dropped, the socket is flagged SS_PROTOREF and
  * the pcb INP_SOCKREF before returning.
  */
 static void
 tcp_usr_abort(struct socket *so)
 {
 	INIT_VNET_INET(so->so_vnet);
 	struct tcpcb *tp = NULL;
 	struct inpcb *inp;
 	TCPDEBUG0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
 
 	INP_INFO_WLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	KASSERT(inp->inp_socket != NULL,
 	    ("tcp_usr_abort: inp_socket == NULL"));
 
 	/*
 	 * Only drop when full TCP state is still present, i.e. the pcb is
 	 * neither in TIME_WAIT nor already dropped.
 	 */
 	if ((inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) == 0) {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 		tcp_drop(tp, ECONNABORTED);
 		TCPDEBUG2(PRU_ABORT);
 	}
 
 	/* Re-read the flags: the drop above runs before this check. */
 	if ((inp->inp_vflag & INP_DROPPED) == 0) {
 		SOCK_LOCK(so);
 		so->so_state |= SS_PROTOREF;
 		SOCK_UNLOCK(so);
 		inp->inp_vflag |= INP_SOCKREF;
 	}
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 }
 
 /*
  * TCP socket is closed: start a friendly disconnect.
  *
  * Unless the pcb ends up dropped, the socket is flagged SS_PROTOREF and
  * the pcb INP_SOCKREF before returning.
  */
 static void
 tcp_usr_close(struct socket *so)
 {
 	INIT_VNET_INET(so->so_vnet);
 	struct tcpcb *tp = NULL;
 	struct inpcb *inp;
 	TCPDEBUG0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
 
 	INP_INFO_WLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	KASSERT(inp->inp_socket != NULL,
 	    ("tcp_usr_close: inp_socket == NULL"));
 
 	/*
 	 * Only initiate a disconnect when full TCP state is still present,
 	 * i.e. the pcb is neither in TIME_WAIT nor already dropped.
 	 */
 	if ((inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) == 0) {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 		tcp_disconnect(tp);
 		TCPDEBUG2(PRU_CLOSE);
 	}
 
 	/* Re-read the flags: the disconnect above runs before this check. */
 	if ((inp->inp_vflag & INP_DROPPED) == 0) {
 		SOCK_LOCK(so);
 		so->so_state |= SS_PROTOREF;
 		SOCK_UNLOCK(so);
 		inp->inp_vflag |= INP_SOCKREF;
 	}
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 }
 
 /*
  * Receive out-of-band data.
  *
  * On success the single saved OOB byte is stored in 'm' (m_len set to 1).
  * Unless MSG_PEEK is given, the HAVEDATA/HADDATA out-of-band flags are
  * toggled so that the byte is reported as consumed.
  *
  * Returns ECONNRESET if the pcb is in TIME_WAIT or dropped, EINVAL if
  * there is no mark, OOB data is delivered inline, or the byte was already
  * read, and EWOULDBLOCK if the byte has not arrived yet.
  */
 static int
 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
 {
 	struct tcpcb *tp = NULL;
 	struct inpcb *inp;
 	int error = 0;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
 	INP_WLOCK(inp);
 	if ((inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) != 0) {
 		error = ECONNRESET;
 	} else {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 		if ((so->so_oobmark == 0 &&
 		    (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
 		    (so->so_options & SO_OOBINLINE) != 0 ||
 		    (tp->t_oobflags & TCPOOB_HADDATA) != 0) {
 			error = EINVAL;
 		} else if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
 			error = EWOULDBLOCK;
 		} else {
 			m->m_len = 1;
 			*mtod(m, caddr_t) = tp->t_iobc;
 			if ((flags & MSG_PEEK) == 0)
 				tp->t_oobflags ^=
 				    (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 		}
 	}
 
 	TCPDEBUG2(PRU_RCVOOB);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 struct pr_usrreqs tcp_usrreqs = {
 	.pru_abort =		tcp_usr_abort,
 	.pru_accept =		tcp_usr_accept,
 	.pru_attach =		tcp_usr_attach,
 	.pru_bind =		tcp_usr_bind,
 	.pru_connect =		tcp_usr_connect,
 	.pru_control =		in_control,
 	.pru_detach =		tcp_usr_detach,
 	.pru_disconnect =	tcp_usr_disconnect,
 	.pru_listen =		tcp_usr_listen,
 	.pru_peeraddr =		in_getpeeraddr,
 	.pru_rcvd =		tcp_usr_rcvd,
 	.pru_rcvoob =		tcp_usr_rcvoob,
 	.pru_send =		tcp_usr_send,
 	.pru_shutdown =		tcp_usr_shutdown,
 	.pru_sockaddr =		in_getsockaddr,
 	.pru_sosetlabel =	in_pcbsosetlabel,
 	.pru_close =		tcp_usr_close,
 };
 
 #ifdef INET6
 /*
  * User-request switch for IPv6 TCP sockets; entries are listed in
  * alphabetical order of the pru_* member name.  Handlers without an
  * IPv6-specific variant are shared with tcp_usrreqs.
  */
 struct pr_usrreqs tcp6_usrreqs = {
 	.pru_abort =		tcp_usr_abort,
 	.pru_accept =		tcp6_usr_accept,
 	.pru_attach =		tcp_usr_attach,
 	.pru_bind =		tcp6_usr_bind,
 	.pru_close =		tcp_usr_close,
 	.pru_connect =		tcp6_usr_connect,
 	.pru_control =		in6_control,
 	.pru_detach =		tcp_usr_detach,
 	.pru_disconnect =	tcp_usr_disconnect,
 	.pru_listen =		tcp6_usr_listen,
 	.pru_peeraddr =		in6_mapped_peeraddr,
 	.pru_rcvd =		tcp_usr_rcvd,
 	.pru_rcvoob =		tcp_usr_rcvoob,
 	.pru_send =		tcp_usr_send,
 	.pru_shutdown =		tcp_usr_shutdown,
 	.pru_sockaddr =		in6_mapped_sockaddr,
 	.pru_sosetlabel =	in_pcbsosetlabel,
 };
 #endif /* INET6 */
 
 /*
  * Common subroutine to open a TCP connection to remote host specified
  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
  * port number if needed.  Call in_pcbconnect_setup to do the routing and
  * to choose a local host address (interface).  If there is an existing
  * incarnation of the same connection in TIME-WAIT state and if the remote
  * host was sending CC options and if the connection duration was < MSL, then
  * truncate the previous TIME-WAIT state and proceed.
  * Initialize connection parameters and enter SYN-SENT state.
  */
 static int
 tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp = tp->t_inpcb, *oinp;
 	struct socket *so = inp->inp_socket;
 	INIT_VNET_INET(so->so_vnet);
 	struct in_addr laddr;
 	u_short lport;
 	int error;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if (inp->inp_lport == 0) {
 		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
 		if (error)
 			return error;
 	}
 
 	/*
 	 * Cannot simply call in_pcbconnect, because there might be an
 	 * earlier incarnation of this same connection still in
 	 * TIME_WAIT state, creating an ADDRINUSE error.
 	 */
 	laddr = inp->inp_laddr;
 	lport = inp->inp_lport;
 	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
 	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
 	if (error && oinp == NULL)
 		return error;
 	if (oinp)
 		return EADDRINUSE;
 	inp->inp_laddr = laddr;
 	in_pcbrehash(inp);
 
 	/*
 	 * Compute window scaling to request:
 	 * Scale to fit into sweet spot.  See tcp_syncache.c.
 	 * XXX: This should move to tcp_output().
 	 */
 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
 	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
 		tp->request_r_scale++;
 
 	soisconnecting(so);
 	V_tcpstat.tcps_connattempt++;
 	tp->t_state = TCPS_SYN_SENT;
 	tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
 	tp->iss = tcp_new_isn(tp);
 	tp->t_bw_rtseq = tp->iss;
 	tcp_sendseqinit(tp);
 
 	return 0;
 }
 
 #ifdef INET6
 /*
  * IPv6 counterpart of tcp_connect(): open a TCP connection to the remote
  * host named by the sockaddr_in6 'nam'.  in6_pcbbind() assigns a local
  * port if the pcb has none yet and in6_pcbladdr() selects the local
  * address.  If a pcb for the same connection already exists, EADDRINUSE
  * is returned.  Otherwise the foreign address, port and flow label are
  * recorded, the connection parameters are initialized and the tcpcb
  * enters SYN-SENT.
  *
  * The caller must hold the tcbinfo write lock and the inpcb write lock.
  * Returns 0 on success or an errno.
  */
 static int
 tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp = tp->t_inpcb, *oinp;
 	struct socket *so = inp->inp_socket;
 	INIT_VNET_INET(so->so_vnet);
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
 	struct in6_addr *addr6;
 	int error;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if (inp->inp_lport == 0) {
 		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
 		if (error)
 			return error;
 	}
 
 	/*
 	 * Cannot simply call in_pcbconnect, because there might be an
 	 * earlier incarnation of this same connection still in
 	 * TIME_WAIT state, creating an ADDRINUSE error.
 	 * in6_pcbladdr() also handles scope zone IDs.
 	 */
 	error = in6_pcbladdr(inp, nam, &addr6);
 	if (error)
 		return error;
 	oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
 				  &sin6->sin6_addr, sin6->sin6_port,
 				  IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
 				  ? addr6
 				  : &inp->in6p_laddr,
 				  inp->inp_lport,  0, NULL);
 	if (oinp)
 		return EADDRINUSE;
 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 		inp->in6p_laddr = *addr6;
 	inp->in6p_faddr = sin6->sin6_addr;
 	inp->inp_fport = sin6->sin6_port;
 	/* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
 	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
 	if (inp->inp_flags & IN6P_AUTOFLOWLABEL)
 		inp->inp_flow |=
 		    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
 	in_pcbrehash(inp);
 
 	/*
 	 * Compute window scaling to request, exactly as tcp_connect() does:
 	 * size it against sb_max rather than the current receive buffer
 	 * high-water mark.  Sockets are created with SB_AUTOSIZE set, so
 	 * the receive buffer may grow after the SYN has gone out, while the
 	 * scale factor requested here cannot change afterwards.
 	 */
 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
 	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
 		tp->request_r_scale++;
 
 	soisconnecting(so);
 	V_tcpstat.tcps_connattempt++;
 	tp->t_state = TCPS_SYN_SENT;
 	tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
 	tp->iss = tcp_new_isn(tp);
 	tp->t_bw_rtseq = tp->iss;
 	tcp_sendseqinit(tp);
 
 	return 0;
 }
 #endif /* INET6 */
 
 /*
  * Export TCP internal state information via a struct tcp_info, based on the
  * Linux 2.6 API.  Not ABI compatible as our constants are mapped differently
  * (TCP state machine, etc).  We export all information using FreeBSD-native
  * constants -- for example, the numeric values for tcpi_state will differ
  * from Linux.
  */
 static void
 tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
 {
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	bzero(ti, sizeof(*ti));
 
 	ti->tcpi_state = tp->t_state;
 	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
 		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		ti->tcpi_options |= TCPI_OPT_SACK;
 	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
 		ti->tcpi_options |= TCPI_OPT_WSCALE;
 		ti->tcpi_snd_wscale = tp->snd_scale;
 		ti->tcpi_rcv_wscale = tp->rcv_scale;
 	}
 
 	ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
 	ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
 
 	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
 	ti->tcpi_snd_cwnd = tp->snd_cwnd;
 
 	/*
 	 * FreeBSD-specific extension fields for tcp_info.
 	 */
 	ti->tcpi_rcv_space = tp->rcv_wnd;
 	ti->tcpi_rcv_nxt = tp->rcv_nxt;
 	ti->tcpi_snd_wnd = tp->snd_wnd;
 	ti->tcpi_snd_bwnd = tp->snd_bwnd;
 	ti->tcpi_snd_nxt = tp->snd_nxt;
 	ti->__tcpi_snd_mss = tp->t_maxseg;
 	ti->__tcpi_rcv_mss = tp->t_maxseg;
 	if (tp->t_flags & TF_TOE)
 		ti->tcpi_options |= TCPI_OPT_TOE;
 }
 
 /*
  * tcp_ctloutput() must drop the inpcb lock before performing copyin on
  * socket option arguments.  When it re-acquires the lock after the copy, it
  * has to revalidate that the connection is still valid for the socket
  * option.
  *
  * This macro re-takes the inpcb write lock and, if the pcb has meanwhile
  * entered TIME_WAIT or been dropped, releases the lock again and makes
  * the *calling function* return ECONNRESET.  Otherwise it reloads the
  * caller's local variable 'tp' from the pcb.  It is only usable inside a
  * function that returns int and has a 'struct tcpcb *tp' in scope.
  */
 #define INP_WLOCK_RECHECK(inp) do {					\
 	INP_WLOCK(inp);							\
 	if ((inp)->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {		\
 		INP_WUNLOCK(inp);					\
 		return (ECONNRESET);					\
 	}								\
 	tp = intotcpcb(inp);						\
 } while(0)
 
 /*
  * Get or set a socket option on a TCP socket.
  *
  * Options whose level is not IPPROTO_TCP are passed on to ip6_ctloutput()
  * or ip_ctloutput().  For TCP-level options the inpcb write lock is held
  * only while the tcpcb is read or modified: it is dropped around every
  * sooptcopyin()/sooptcopyout() call, and SET paths revalidate the pcb via
  * INP_WLOCK_RECHECK() after the copyin.
  *
  * Returns 0 or an errno: ECONNRESET if the pcb is in TIME_WAIT or dropped,
  * ENOPROTOOPT for an unknown option name, EINVAL for a rejected value or
  * an attempt to set TCP_INFO, or the error from the copy routines or
  * tcp_output().
  */
 int
 tcp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	INIT_VNET_INET(so->so_vnet);
 	int	error, opt, optval;
 	struct	inpcb *inp;
 	struct	tcpcb *tp;
 	struct	tcp_info ti;
 
 	error = 0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
 	INP_WLOCK(inp);
 	if (sopt->sopt_level != IPPROTO_TCP) {
 		/* Not ours: hand off to the IP layer with the lock released. */
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6PROTO) {
 			INP_WUNLOCK(inp);
 			error = ip6_ctloutput(so, sopt);
 		} else {
 #endif /* INET6 */
 			INP_WUNLOCK(inp);
 			error = ip_ctloutput(so, sopt);
 #ifdef INET6
 		}
 #endif
 		return (error);
 	}
 	if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 
 	/* Every case below is responsible for releasing the inpcb lock. */
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 #ifdef TCP_SIGNATURE
 		case TCP_MD5SIG:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			if (optval > 0)
 				tp->t_flags |= TF_SIGNATURE;
 			else
 				tp->t_flags &= ~TF_SIGNATURE;
 			INP_WUNLOCK(inp);
 			break;
 #endif /* TCP_SIGNATURE */
 		case TCP_NODELAY:
 		case TCP_NOOPT:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			/* Map the option name onto the tcpcb flag it controls. */
 			switch (sopt->sopt_name) {
 			case TCP_NODELAY:
 				opt = TF_NODELAY;
 				break;
 			case TCP_NOOPT:
 				opt = TF_NOOPT;
 				break;
 			default:
 				opt = 0; /* dead code to fool gcc */
 				break;
 			}
 
 			if (optval)
 				tp->t_flags |= opt;
 			else
 				tp->t_flags &= ~opt;
 			INP_WUNLOCK(inp);
 			break;
 
 		case TCP_NOPUSH:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			if (optval)
 				tp->t_flags |= TF_NOPUSH;
 			else {
 				/* Clearing NOPUSH also triggers an output call. */
 				tp->t_flags &= ~TF_NOPUSH;
 				error = tcp_output(tp);
 			}
 			INP_WUNLOCK(inp);
 			break;
 
 		case TCP_MAXSEG:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			/*
 			 * The segment size may only be lowered, and
 			 * optval + 40 must not fall below V_tcp_minmss.
 			 */
 			if (optval > 0 && optval <= tp->t_maxseg &&
 			    optval + 40 >= V_tcp_minmss)
 				tp->t_maxseg = optval;
 			else
 				error = EINVAL;
 			INP_WUNLOCK(inp);
 			break;
 
 		case TCP_INFO:
 			/* TCP_INFO is read-only. */
 			INP_WUNLOCK(inp);
 			error = EINVAL;
 			break;
 
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		/*
 		 * The value is sampled under the lock, then copied out after
 		 * the lock has been released.
 		 */
 		tp = intotcpcb(inp);
 		switch (sopt->sopt_name) {
 #ifdef TCP_SIGNATURE
 		case TCP_MD5SIG:
 			optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 #endif
 
 		case TCP_NODELAY:
 			optval = tp->t_flags & TF_NODELAY;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_MAXSEG:
 			optval = tp->t_maxseg;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_NOOPT:
 			optval = tp->t_flags & TF_NOOPT;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_NOPUSH:
 			optval = tp->t_flags & TF_NOPUSH;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_INFO:
 			tcp_fill_info(tp, &ti);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &ti, sizeof ti);
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 #undef INP_WLOCK_RECHECK
 
 /*
  * tcp_sendspace and tcp_recvspace are the default send and receive window
  * sizes, respectively.  These are obsolescent (this information should
  * be set by the route).
  *
  * tcp_attach() passes them to soreserve() as the initial send and receive
  * socket buffer reservations, which is what the sysctl descriptions
  * below state.
  */
 u_long	tcp_sendspace = 1024*32;
 SYSCTL_ULONG(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
     &tcp_sendspace , 0, "Initial send socket buffer size");
 u_long	tcp_recvspace = 1024*64;
 SYSCTL_ULONG(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
     &tcp_recvspace , 0, "Initial receive socket buffer size");
 
 /*
  * Attach TCP protocol to socket, allocating
  * internet protocol control block, tcp control block,
  * and buffer space.  The new tcpcb is left in the CLOSED state.
  *
  * Returns 0 on success, the error from soreserve() or in_pcballoc(), or
  * ENOBUFS if no tcpcb could be allocated.
  */
 static int
 tcp_attach(struct socket *so)
 {
 	INIT_VNET_INET(so->so_vnet);
 	struct tcpcb *tp;
 	struct inpcb *inp;
 	int error;
 
 	/* Reserve default buffer space unless both limits are already set. */
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
 		if (error)
 			return (error);
 	}
 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	INP_INFO_WLOCK(&V_tcbinfo);
 	error = in_pcballoc(so, &V_tcbinfo);
 	if (error) {
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		return (error);
 	}
 	inp = sotoinpcb(so);
 	/* Record the address family the pcb starts out speaking. */
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6PROTO) {
 		inp->inp_vflag |= INP_IPV6;
 		inp->in6p_hops = -1;	/* use kernel default */
 	}
 	else
 #endif
 	inp->inp_vflag |= INP_IPV4;
 	tp = tcp_newtcpcb(inp);
 	if (tp == NULL) {
 		/* Undo the pcb allocation; the inpcb is not unlocked here. */
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		return (ENOBUFS);
 	}
 	tp->t_state = TCPS_CLOSED;
 	/*
 	 * NOTE(review): no INP_WLOCK() is visible in this function, so
 	 * in_pcballoc() presumably returns with the inpcb locked -- confirm.
 	 */
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (0);
 }
 
 /*
  * Initiate (or continue) a disconnect.
  *
  *  - In an embryonic state (before ESTABLISHED), just close, which sends
  *    a reset (once).
  *  - With SO_LINGER set and a zero linger time, just drop.
  *  - Otherwise (the hard case), mark the socket disconnecting and drop
  *    the current input data; switch states based on the user close, and
  *    send a segment to the peer (with FIN).
  *
  * The caller must hold the tcbinfo write lock and the inpcb write lock.
  */
 static void
 tcp_disconnect(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 #ifdef INVARIANTS
 	INIT_VNET_INET(so->so_vnet);
 #endif
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * The socket is still open, so neither tcp_close() nor tcp_drop()
 	 * is expected to return NULL below.
 	 */
 	if (tp->t_state < TCPS_ESTABLISHED) {
 		tp = tcp_close(tp);
 		KASSERT(tp != NULL,
 		    ("tcp_disconnect: tcp_close() returned NULL"));
 		return;
 	}
 	if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
 		tp = tcp_drop(tp, 0);
 		KASSERT(tp != NULL,
 		    ("tcp_disconnect: tcp_drop() returned NULL"));
 		return;
 	}
 
 	soisdisconnecting(so);
 	sbflush(&so->so_rcv);
 	tcp_usrclosed(tp);
 	if ((inp->inp_vflag & INP_DROPPED) == 0)
 		tcp_output_disconnect(tp);
 }
 
 /*
  * User issued close, and wish to trail through shutdown states:
  * if never received SYN, just forget it.  If got a SYN from peer,
  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
  * If already got a FIN from peer, then almost done; go to LAST_ACK
  * state.  In all other cases, have already sent FIN to peer (e.g.
  * after PRU_SHUTDOWN), and just have to play tedious game waiting
  * for peer to send FIN or not respond to keep-alives, etc.
  * We can let the user exit from the close as soon as the FIN is acked.
  *
  * The caller must hold the tcbinfo write lock and the inpcb write lock.
  */
 static void
 tcp_usrclosed(struct tcpcb *tp)
 {
 #ifdef INVARIANTS
 	INIT_VNET_INET(tp->t_inpcb->inp_vnet);
 #endif
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	switch (tp->t_state) {
 	case TCPS_LISTEN:
 		/* Listening sockets also notify the offload layer. */
 		tcp_offload_listen_close(tp);
 		/* FALLTHROUGH */
 	case TCPS_CLOSED:
 		tp->t_state = TCPS_CLOSED;
 		tp = tcp_close(tp);
 		/*
 		 * tcp_close() should never return NULL here as the socket is
 		 * still open.
 		 */
 		KASSERT(tp != NULL,
 		    ("tcp_usrclosed: tcp_close() returned NULL"));
 		break;
 
 	case TCPS_SYN_SENT:
 	case TCPS_SYN_RECEIVED:
 		/* Handshake still in progress: remember that a FIN is owed. */
 		tp->t_flags |= TF_NEEDFIN;
 		break;
 
 	case TCPS_ESTABLISHED:
 		tp->t_state = TCPS_FIN_WAIT_1;
 		break;
 
 	case TCPS_CLOSE_WAIT:
 		tp->t_state = TCPS_LAST_ACK;
 		break;
 	}
 	/* States not listed above are left unchanged by the switch. */
 	if (tp->t_state >= TCPS_FIN_WAIT_2) {
 		soisdisconnected(tp->t_inpcb->inp_socket);
 		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
 		if (tp->t_state == TCPS_FIN_WAIT_2) {
 			int timeout;
 
 			timeout = (tcp_fast_finwait2_recycle) ? 
 			    tcp_finwait2_timeout : tcp_maxidle;
 			tcp_timer_activate(tp, TT_2MSL, timeout);
 		}
 	}
 }
 
 #ifdef DDB
 /*
  * Emit 'indent' spaces on the debugger console; nothing is printed for a
  * zero or negative count.
  */
 static void
 db_print_indent(int indent)
 {
 
 	while (indent-- > 0)
 		db_printf(" ");
 }
 
 /*
  * Print the symbolic name of a TCP connection state, or "unknown" for a
  * value that matches none of the TCPS_* constants.
  */
 static void
 db_print_tstate(int t_state)
 {
 
 	switch (t_state) {
 	case TCPS_CLOSED:
 		db_printf("TCPS_CLOSED");
 		break;
 	case TCPS_LISTEN:
 		db_printf("TCPS_LISTEN");
 		break;
 	case TCPS_SYN_SENT:
 		db_printf("TCPS_SYN_SENT");
 		break;
 	case TCPS_SYN_RECEIVED:
 		db_printf("TCPS_SYN_RECEIVED");
 		break;
 	case TCPS_ESTABLISHED:
 		db_printf("TCPS_ESTABLISHED");
 		break;
 	case TCPS_CLOSE_WAIT:
 		db_printf("TCPS_CLOSE_WAIT");
 		break;
 	case TCPS_FIN_WAIT_1:
 		db_printf("TCPS_FIN_WAIT_1");
 		break;
 	case TCPS_CLOSING:
 		db_printf("TCPS_CLOSING");
 		break;
 	case TCPS_LAST_ACK:
 		db_printf("TCPS_LAST_ACK");
 		break;
 	case TCPS_FIN_WAIT_2:
 		db_printf("TCPS_FIN_WAIT_2");
 		break;
 	case TCPS_TIME_WAIT:
 		db_printf("TCPS_TIME_WAIT");
 		break;
 	default:
 		db_printf("unknown");
 		break;
 	}
 }
 
 /*
  * Print the symbolic names of all TF_* bits set in 't_flags', separated
  * by ", ".  Each flag is listed at most once (TF_NOPUSH used to be tested
  * twice and so appeared twice in the output), and every label matches the
  * name of the flag it stands for.
  */
 static void
 db_print_tflags(u_int t_flags)
 {
 	static const struct {
 		u_int		 flag;
 		const char	*name;
 	} tflag_names[] = {
 		{ TF_ACKNOW,		"TF_ACKNOW" },
 		{ TF_DELACK,		"TF_DELACK" },
 		{ TF_NODELAY,		"TF_NODELAY" },
 		{ TF_NOOPT,		"TF_NOOPT" },
 		{ TF_SENTFIN,		"TF_SENTFIN" },
 		{ TF_REQ_SCALE,		"TF_REQ_SCALE" },
 		{ TF_RCVD_SCALE,	"TF_RCVD_SCALE" },
 		{ TF_REQ_TSTMP,		"TF_REQ_TSTMP" },
 		{ TF_RCVD_TSTMP,	"TF_RCVD_TSTMP" },
 		{ TF_SACK_PERMIT,	"TF_SACK_PERMIT" },
 		{ TF_NEEDSYN,		"TF_NEEDSYN" },
 		{ TF_NEEDFIN,		"TF_NEEDFIN" },
 		{ TF_NOPUSH,		"TF_NOPUSH" },
 		{ TF_MORETOCOME,	"TF_MORETOCOME" },
 		{ TF_LQ_OVERFLOW,	"TF_LQ_OVERFLOW" },
 		{ TF_LASTIDLE,		"TF_LASTIDLE" },
 		{ TF_RXWIN0SENT,	"TF_RXWIN0SENT" },
 		{ TF_FASTRECOVERY,	"TF_FASTRECOVERY" },
 		{ TF_WASFRECOVERY,	"TF_WASFRECOVERY" },
 		{ TF_SIGNATURE,		"TF_SIGNATURE" },
 		{ TF_FORCEDATA,		"TF_FORCEDATA" },
 		{ TF_TSO,		"TF_TSO" },
 		{ TF_ECN_PERMIT,	"TF_ECN_PERMIT" },
 	};
 	const char *sep;
 	u_int i;
 
 	/* The separator is empty until the first flag has been printed. */
 	sep = "";
 	for (i = 0; i < sizeof(tflag_names) / sizeof(tflag_names[0]); i++) {
 		if ((t_flags & tflag_names[i].flag) == 0)
 			continue;
 		db_printf("%s%s", sep, tflag_names[i].name);
 		sep = ", ";
 	}
 }
 
 /*
  * Print the symbolic names of the TCPOOB_* bits set in 't_oobflags',
  * separated by ", ".
  */
 static void
 db_print_toobflags(char t_oobflags)
 {
 	const char *sep;
 
 	sep = "";
 	if (t_oobflags & TCPOOB_HAVEDATA) {
 		db_printf("%sTCPOOB_HAVEDATA", sep);
 		sep = ", ";
 	}
 	if (t_oobflags & TCPOOB_HADDATA)
 		db_printf("%sTCPOOB_HADDATA", sep);
 }
 
 static void
 db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, tp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
 	   LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
 
 	db_print_indent(indent);
 	db_printf("tt_rexmt: %p   tt_persist: %p   tt_keep: %p\n",
 	    &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep);
 
 	db_print_indent(indent);
 	db_printf("tt_2msl: %p   tt_delack: %p   t_inpcb: %p\n", &tp->t_timers->tt_2msl,
 	    &tp->t_timers->tt_delack, tp->t_inpcb);
 
 	db_print_indent(indent);
 	db_printf("t_state: %d (", tp->t_state);
 	db_print_tstate(tp->t_state);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("t_flags: 0x%x (", tp->t_flags);
 	db_print_tflags(tp->t_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("snd_una: 0x%08x   snd_max: 0x%08x   snd_nxt: x0%08x\n",
 	    tp->snd_una, tp->snd_max, tp->snd_nxt);
 
 	db_print_indent(indent);
 	db_printf("snd_up: 0x%08x   snd_wl1: 0x%08x   snd_wl2: 0x%08x\n",
 	   tp->snd_up, tp->snd_wl1, tp->snd_wl2);
 
 	db_print_indent(indent);
 	db_printf("iss: 0x%08x   irs: 0x%08x   rcv_nxt: 0x%08x\n",
 	    tp->iss, tp->irs, tp->rcv_nxt);
 
 	db_print_indent(indent);
 	db_printf("rcv_adv: 0x%08x   rcv_wnd: %lu   rcv_up: 0x%08x\n",
 	    tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
 
 	db_print_indent(indent);
 	db_printf("snd_wnd: %lu   snd_cwnd: %lu   snd_bwnd: %lu\n",
 	   tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd);
 
 	db_print_indent(indent);
 	db_printf("snd_ssthresh: %lu   snd_bandwidth: %lu   snd_recover: "
 	    "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth,
 	    tp->snd_recover);
 
 	db_print_indent(indent);
 	db_printf("t_maxopd: %u   t_rcvtime: %lu   t_startime: %lu\n",
 	    tp->t_maxopd, tp->t_rcvtime, tp->t_starttime);
 
 	db_print_indent(indent);
 	db_printf("t_rttime: %d   t_rtsq: 0x%08x   t_bw_rtttime: %d\n",
 	    tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime);
 
 	db_print_indent(indent);
 	db_printf("t_bw_rtseq: 0x%08x   t_rxtcur: %d   t_maxseg: %u   "
 	    "t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg,
 	    tp->t_srtt);
 
 	db_print_indent(indent);
 	db_printf("t_rttvar: %d   t_rxtshift: %d   t_rttmin: %u   "
 	    "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
 	    tp->t_rttbest);
 
 	db_print_indent(indent);
 	db_printf("t_rttupdated: %lu   max_sndwnd: %lu   t_softerror: %d\n",
 	    tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);
 
 	db_print_indent(indent);
 	db_printf("t_oobflags: 0x%x (", tp->t_oobflags);
 	db_print_toobflags(tp->t_oobflags);
 	db_printf(")   t_iobc: 0x%02x\n", tp->t_iobc);
 
 	db_print_indent(indent);
 	db_printf("snd_scale: %u   rcv_scale: %u   request_r_scale: %u\n",
 	    tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
 
 	db_print_indent(indent);
 	db_printf("ts_recent: %u   ts_recent_age: %lu\n",
 	    tp->ts_recent, tp->ts_recent_age);
 
 	db_print_indent(indent);
 	db_printf("ts_offset: %u   last_ack_sent: 0x%08x   snd_cwnd_prev: "
 	    "%lu\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
 
 	db_print_indent(indent);
 	db_printf("snd_ssthresh_prev: %lu   snd_recover_prev: 0x%08x   "
 	    "t_badrxtwin: %lu\n", tp->snd_ssthresh_prev,
 	    tp->snd_recover_prev, tp->t_badrxtwin);
 
 	db_print_indent(indent);
 	db_printf("snd_numholes: %d  snd_holes first: %p\n",
 	    tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
 
 	db_print_indent(indent);
 	db_printf("snd_fack: 0x%08x   rcv_numsacks: %d   sack_newdata: "
 	    "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata);
 
 	/* Skip sackblks, sackhint. */
 
 	db_print_indent(indent);
 	db_printf("t_rttlow: %d   rfbuf_ts: %u   rfbuf_cnt: %d\n",
 	    tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
 }
 
 /*
  * "show tcpcb <addr>" debugger command: dump the tcpcb at the given
  * address, or print a usage line when no address was supplied.
  */
 DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
 {
 
 	if (have_addr) {
 		db_print_tcpcb((struct tcpcb *)addr, "tcpcb", 0);
 		return;
 	}
 	db_printf("usage: show tcpcb <addr>\n");
 }
 #endif
Index: head/sys/netinet6/icmp6.c
===================================================================
--- head/sys/netinet6/icmp6.c	(revision 186221)
+++ head/sys/netinet6/icmp6.c	(revision 186222)
@@ -1,2829 +1,2829 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_icmp.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/vimage.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet/tcp_var.h>
 #include <netinet/vinet.h>
 
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6protosw.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/vinet6.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/key.h>
 #endif
 
 /* Defined outside this file. */
 extern struct domain inet6domain;
 
 /*
  * The declarations below are only compiled when VIMAGE_GLOBALS is defined.
  * The code in this file reaches the ICMPv6 ones through V_-prefixed names
  * (V_icmp6stat, V_icmp6errpps_count, V_icmp6_nodeinfo).
  */
 #ifdef VIMAGE_GLOBALS
 extern struct inpcbinfo ripcbinfo;
 extern struct inpcbhead ripcb;
 extern int icmp6errppslim;
 extern int icmp6_nodeinfo;
 
 /* ICMPv6 statistics and error rate-limit state owned by this file. */
 struct icmp6stat icmp6stat;
 static int icmp6errpps_count;
 static struct timeval icmp6errppslim_last;
 #endif
 
 /* Forward declarations of the file-local helpers. */
 static void icmp6_errcount(struct icmp6errstat *, int, int);
 static int icmp6_rip6_input(struct mbuf **, int);
 static int icmp6_ratelimit(const struct in6_addr *, const int, const int);
 static const char *icmp6_redirect_diag(struct in6_addr *, struct in6_addr *,
     struct in6_addr *);
 static struct mbuf *ni6_input(struct mbuf *, int);
 static struct mbuf *ni6_nametodns(const char *, int, int);
 static int ni6_dnsmatch(const char *, int, const char *, int);
 static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **,
     struct in6_addr *);
 static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *,
     struct ifnet *, int);
 static int icmp6_notify_error(struct mbuf **, int, int, int);
 
 
 /*
  * ICMPv6 initialisation: reset the error rate-limit packet counter and
  * then run mld6_init().
  */
 void
 icmp6_init(void)
 {
 	INIT_VNET_INET6(curvnet);
 
 	V_icmp6errpps_count = 0;
 
 	mld6_init();
 }
 
 /*
  * Bump the counter in *stat that matches an ICMPv6 error of the given type
  * and code.  Any type, or any code within a known type, that has no
  * dedicated counter is accounted under icp6errs_unknown.
  */
 static void
 icmp6_errcount(struct icmp6errstat *stat, int type, int code)
 {
 
 	/* Types that are counted regardless of their code. */
 	if (type == ICMP6_PACKET_TOO_BIG) {
 		stat->icp6errs_packet_too_big++;
 		return;
 	}
 	if (type == ND_REDIRECT) {
 		stat->icp6errs_redirect++;
 		return;
 	}
 
 	/* Types that are broken down by code. */
 	if (type == ICMP6_DST_UNREACH) {
 		if (code == ICMP6_DST_UNREACH_NOROUTE) {
 			stat->icp6errs_dst_unreach_noroute++;
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_ADMIN) {
 			stat->icp6errs_dst_unreach_admin++;
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_BEYONDSCOPE) {
 			stat->icp6errs_dst_unreach_beyondscope++;
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_ADDR) {
 			stat->icp6errs_dst_unreach_addr++;
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_NOPORT) {
 			stat->icp6errs_dst_unreach_noport++;
 			return;
 		}
 	} else if (type == ICMP6_TIME_EXCEEDED) {
 		if (code == ICMP6_TIME_EXCEED_TRANSIT) {
 			stat->icp6errs_time_exceed_transit++;
 			return;
 		}
 		if (code == ICMP6_TIME_EXCEED_REASSEMBLY) {
 			stat->icp6errs_time_exceed_reassembly++;
 			return;
 		}
 	} else if (type == ICMP6_PARAM_PROB) {
 		if (code == ICMP6_PARAMPROB_HEADER) {
 			stat->icp6errs_paramprob_header++;
 			return;
 		}
 		if (code == ICMP6_PARAMPROB_NEXTHEADER) {
 			stat->icp6errs_paramprob_nextheader++;
 			return;
 		}
 		if (code == ICMP6_PARAMPROB_OPTION) {
 			stat->icp6errs_paramprob_option++;
 			return;
 		}
 	}
 
 	/* Nothing matched. */
 	stat->icp6errs_unknown++;
 }
 
 /*
  * A wrapper function for icmp6_error() necessary when the erroneous packet
  * may not contain enough scope zone information.  The scope zones of the
  * packet's source and destination addresses are set from ifp before the
  * packet is handed to icmp6_error().
  *
  * m:                 the offending packet.
  * type, code, param: passed through unchanged to icmp6_error().
  * ifp:               interface used to derive the scope zones.
  *
  * Ownership of m always passes to this function.  When the error cannot be
  * generated (no interface, or a scope zone cannot be set) the chain is
  * freed here, so that callers, which never look at m again, do not leak it.
  */
 void
 icmp6_error2(struct mbuf *m, int type, int code, int param,
     struct ifnet *ifp)
 {
 	INIT_VNET_INET6(curvnet);
 	struct ip6_hdr *ip6;
 
 	if (ifp == NULL)
 		goto freeit;
 
 #ifndef PULLDOWN_TEST
 	/*
 	 * NOTE(review): on failure IP6_EXTHDR_CHECK() returns from this
 	 * function; it is expected to have released m itself -- confirm
 	 * against its definition.
 	 */
 	IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), );
 #else
 	if (m->m_len < sizeof(struct ip6_hdr)) {
 		m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL)
 			return;
 	}
 #endif
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0)
 		goto freeit;
 	if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0)
 		goto freeit;
 
 	icmp6_error(m, type, code, param);
 	return;
 
   freeit:
 	/* The error cannot be generated; do not leak the packet. */
 	m_freem(m);
 }
 
 /*
  * Generate an error packet of type error in response to bad IP6 packet.
  *
  * m is the offending packet; type and code become the ICMPv6 type and code
  * of the error, and param is stored (in network byte order) in its 32-bit
  * parameter field.  The error is suppressed, and m freed, when the packet
  * is multicast/broadcast (with the exceptions noted below), has an
  * unspecified or multicast source, is itself an ICMPv6 error or redirect,
  * or when the rate limit is exceeded.  Otherwise the IPv6 and ICMPv6
  * headers are prepended to (a possibly truncated) m and the result is given
  * to icmp6_reflect().
  */
 void
 icmp6_error(struct mbuf *m, int type, int code, int param)
 {
 	INIT_VNET_INET6(curvnet);
 	struct ip6_hdr *oip6, *nip6;
 	struct icmp6_hdr *icmp6;
 	u_int preplen;
 	int off;
 	int nxt;
 
 	V_icmp6stat.icp6s_error++;
 
 	/* count per-type-code statistics */
 	icmp6_errcount(&V_icmp6stat.icp6s_outerrhist, type, code);
 
 #ifdef M_DECRYPTED	/*not openbsd*/
 	if (m->m_flags & M_DECRYPTED) {
 		V_icmp6stat.icp6s_canterror++;
 		goto freeit;
 	}
 #endif
 
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), );
 #else
 	if (m->m_len < sizeof(struct ip6_hdr)) {
 		m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL)
 			return;
 	}
 #endif
 	oip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * If the destination address of the erroneous packet is a multicast
 	 * address, or the packet was sent using link-layer multicast,
 	 * we should basically suppress sending an error (RFC 2463, Section
 	 * 2.4).
 	 * We have two exceptions (the item e.2 in that section):
 	 * - the Packet Too Big message can be sent for path MTU discovery.
 	 * - the Parameter Problem Message that can be allowed an icmp6 error
 	 *   in the option type field.  This check has been done in
 	 *   ip6_unknown_opt(), so we can just check the type and code.
 	 */
 	if ((m->m_flags & (M_BCAST|M_MCAST) ||
 	     IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) &&
 	    (type != ICMP6_PACKET_TOO_BIG &&
 	     (type != ICMP6_PARAM_PROB ||
 	      code != ICMP6_PARAMPROB_OPTION)))
 		goto freeit;
 
 	/*
 	 * RFC 2463, 2.4 (e.5): source address check.
 	 * XXX: the case of anycast source?
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) ||
 	    IN6_IS_ADDR_MULTICAST(&oip6->ip6_src))
 		goto freeit;
 
 	/*
 	 * If we are about to send ICMPv6 against ICMPv6 error/redirect,
 	 * don't do it.
 	 */
 	nxt = -1;
 	off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
 	if (off >= 0 && nxt == IPPROTO_ICMPV6) {
 		struct icmp6_hdr *icp;
 
 #ifndef PULLDOWN_TEST
 		IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), );
 		icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 #else
 		IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off,
 			sizeof(*icp));
 		if (icp == NULL) {
 			V_icmp6stat.icp6s_tooshort++;
 			return;
 		}
 #endif
 		/* Types below ICMP6_ECHO_REQUEST are the error messages. */
 		if (icp->icmp6_type < ICMP6_ECHO_REQUEST ||
 		    icp->icmp6_type == ND_REDIRECT) {
 			/*
 			 * ICMPv6 error
 			 * Special case: for redirect (which is
 			 * informational) we must not send icmp6 error.
 			 */
 			V_icmp6stat.icp6s_canterror++;
 			goto freeit;
 		} else {
 			/* ICMPv6 informational - send the error */
 		}
 	} else {
 		/* non-ICMPv6 - send the error */
 	}
 
 	oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */
 
 	/* Finally, do rate limitation check. */
 	if (icmp6_ratelimit(&oip6->ip6_src, type, code)) {
 		V_icmp6stat.icp6s_toofreq++;
 		goto freeit;
 	}
 
 	/*
 	 * OK, ICMP6 can be generated.
 	 */
 
 	/*
 	 * Cap the quoted packet at ICMPV6_PLD_MAXLEN bytes.  The count passed
 	 * to m_adj() is zero or negative here; per mbuf(9) a negative count
 	 * trims from the end of the chain.
 	 */
 	if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN)
 		m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len);
 
 	/* Make room in front for the new IPv6 + ICMPv6 headers. */
 	preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
 	M_PREPEND(m, preplen, M_DONTWAIT);
 	if (m && m->m_len < preplen)
 		m = m_pullup(m, preplen);
 	if (m == NULL) {
 		nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__));
 		return;
 	}
 
 	/*
 	 * The new header starts out with the offender's addresses, copied
 	 * as-is (not swapped here).
 	 */
 	nip6 = mtod(m, struct ip6_hdr *);
 	nip6->ip6_src  = oip6->ip6_src;
 	nip6->ip6_dst  = oip6->ip6_dst;
 
 	/* Clear the scope from the addresses of the quoted (old) header. */
 	in6_clearscope(&oip6->ip6_src);
 	in6_clearscope(&oip6->ip6_dst);
 
 	icmp6 = (struct icmp6_hdr *)(nip6 + 1);
 	icmp6->icmp6_type = type;
 	icmp6->icmp6_code = code;
 	icmp6->icmp6_pptr = htonl((u_int32_t)param);
 
 	/*
 	 * icmp6_reflect() is designed to be in the input path.
 	 * icmp6_error() can be called from both input and output path,
 	 * and if we are in output path rcvif could contain bogus value.
 	 * clear m->m_pkthdr.rcvif for safety, we should have enough scope
 	 * information in ip header (nip6).
 	 */
 	m->m_pkthdr.rcvif = NULL;
 
 	V_icmp6stat.icp6s_outhist[type]++;
 	icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */
 
 	return;
 
   freeit:
 	/*
 	 * If we can't tell whether or not we can generate ICMP6, free it.
 	 */
 	m_freem(m);
 }
 
 /*
  * Process a received ICMP6 message.
  *
  * *mp is the packet and *offp the offset of the ICMPv6 header within it;
  * proto is not used in this function.  The message is length- and
  * checksum-verified, counted, and then dispatched on its type: errors are
  * passed to icmp6_notify_error(), echo and node-information requests are
  * answered through icmp6_reflect(), and MLD/ND messages are handed (as a
  * copy when one can be made) to their handlers.  Unless a path jumps to
  * freeit or returns early, the original packet is finally given to
  * icmp6_rip6_input() for delivery to sockets.  Always returns IPPROTO_DONE.
  */
 int
 icmp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	INIT_VNET_INET6(curvnet);
 	INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */
 	struct mbuf *m = *mp, *n;
 	struct ip6_hdr *ip6, *nip6;
 	struct icmp6_hdr *icmp6, *nicmp6;
 	int off = *offp;
 	int icmp6len = m->m_pkthdr.len - *offp;
 	int code, sum, noff;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE);
 	/* m might change if M_LOOP.  So, call mtod after this */
 #endif
 
 	/*
 	 * Locate icmp6 structure in mbuf, and check
 	 * that not corrupted and of at least minimum length
 	 */
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (icmp6len < sizeof(struct icmp6_hdr)) {
 		V_icmp6stat.icp6s_tooshort++;
 		goto freeit;
 	}
 
 	/*
 	 * calculate the checksum
 	 */
 #ifndef PULLDOWN_TEST
 	icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6));
 	if (icmp6 == NULL) {
 		V_icmp6stat.icp6s_tooshort++;
 		return IPPROTO_DONE;
 	}
 #endif
 	code = icmp6->icmp6_code;
 
 	if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) {
 		nd6log((LOG_ERR,
 		    "ICMP6 checksum error(%d|%x) %s\n",
 		    icmp6->icmp6_type, sum,
 		    ip6_sprintf(ip6bufs, &ip6->ip6_src)));
 		V_icmp6stat.icp6s_checksum++;
 		goto freeit;
 	}
 
 	if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) {
 		/*
 		 * Deliver very specific ICMP6 type only.
 		 * This is important to deliver TOOBIG.  Otherwise PMTUD
 		 * will not work.
 		 */
 		switch (icmp6->icmp6_type) {
 		case ICMP6_DST_UNREACH:
 		case ICMP6_PACKET_TOO_BIG:
 		case ICMP6_TIME_EXCEEDED:
 			break;
 		default:
 			goto freeit;
 		}
 	}
 
 	/* Per-type and per-interface input statistics. */
 	V_icmp6stat.icp6s_inhist[icmp6->icmp6_type]++;
 	icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_msg);
 	if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK)
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error);
 
 	/*
 	 * For the error types, the ICMPv6 code is translated into a PRC_*
 	 * value which is what icmp6_notify_error() receives at "deliver".
 	 */
 	switch (icmp6->icmp6_type) {
 	case ICMP6_DST_UNREACH:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_dstunreach);
 		switch (code) {
 		case ICMP6_DST_UNREACH_NOROUTE:
 			code = PRC_UNREACH_NET;
 			break;
 		case ICMP6_DST_UNREACH_ADMIN:
 			icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_adminprohib);
 			code = PRC_UNREACH_PROTOCOL; /* is this a good code? */
 			break;
 		case ICMP6_DST_UNREACH_ADDR:
 			code = PRC_HOSTDEAD;
 			break;
 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
 			/* I mean "source address was incorrect." */
 			code = PRC_PARAMPROB;
 			break;
 		case ICMP6_DST_UNREACH_NOPORT:
 			code = PRC_UNREACH_PORT;
 			break;
 		default:
 			goto badcode;
 		}
 		goto deliver;
 		break;
 
 	case ICMP6_PACKET_TOO_BIG:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig);
 
 		/* validation is made in icmp6_mtudisc_update */
 
 		code = PRC_MSGSIZE;
 
 		/*
 		 * Updating the path MTU will be done after examining
 		 * intermediate extension headers.
 		 */
 		goto deliver;
 		break;
 
 	case ICMP6_TIME_EXCEEDED:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed);
 		switch (code) {
 		case ICMP6_TIME_EXCEED_TRANSIT:
 			code = PRC_TIMXCEED_INTRANS;
 			break;
 		case ICMP6_TIME_EXCEED_REASSEMBLY:
 			code = PRC_TIMXCEED_REASS;
 			break;
 		default:
 			goto badcode;
 		}
 		goto deliver;
 		break;
 
 	case ICMP6_PARAM_PROB:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_paramprob);
 		switch (code) {
 		case ICMP6_PARAMPROB_NEXTHEADER:
 			code = PRC_UNREACH_PROTOCOL;
 			break;
 		case ICMP6_PARAMPROB_HEADER:
 		case ICMP6_PARAMPROB_OPTION:
 			code = PRC_PARAMPROB;
 			break;
 		default:
 			goto badcode;
 		}
 		goto deliver;
 		break;
 
 	case ICMP6_ECHO_REQUEST:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo);
 		if (code != 0)
 			goto badcode;
 		/*
 		 * The reply is built from a copy (n); the original m is
 		 * still delivered to sockets after the switch.
 		 */
 		if ((n = m_copy(m, 0, M_COPYALL)) == NULL) {
 			/* Give up remote */
 			break;
 		}
 		if ((n->m_flags & M_EXT) != 0
 		 || n->m_len < off + sizeof(struct icmp6_hdr)) {
 			/*
 			 * The first mbuf of the copy uses external storage
 			 * or does not hold the headers contiguously: put the
 			 * IPv6 and ICMPv6 headers into a fresh header mbuf
 			 * and chain the rest of the copy (n0) behind it.
 			 */
 			struct mbuf *n0 = n;
 			const int maxlen = sizeof(*nip6) + sizeof(*nicmp6);
 			int n0len;
 
 			MGETHDR(n, M_DONTWAIT, n0->m_type);
 			n0len = n0->m_pkthdr.len;	/* save for use below */
 			if (n)
 				M_MOVE_PKTHDR(n, n0);
 			if (n && maxlen >= MHLEN) {
 				MCLGET(n, M_DONTWAIT);
 				if ((n->m_flags & M_EXT) == 0) {
 					m_free(n);
 					n = NULL;
 				}
 			}
 			if (n == NULL) {
 				/* Give up remote */
 				m_freem(n0);
 				break;
 			}
 			/*
 			 * Copy IPv6 and ICMPv6 only.
 			 */
 			nip6 = mtod(n, struct ip6_hdr *);
 			bcopy(ip6, nip6, sizeof(struct ip6_hdr));
 			nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
 			bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
 			noff = sizeof(struct ip6_hdr);
 			/* new mbuf contains only ipv6+icmpv6 headers */
 			n->m_len = noff + sizeof(struct icmp6_hdr);
 			/*
 			 * Adjust mbuf.  ip6_plen will be adjusted in
 			 * ip6_output().
 			 */
 			m_adj(n0, off + sizeof(struct icmp6_hdr));
 			/* recalculate complete packet size */
 			n->m_pkthdr.len = n0len + (noff - off);
 			n->m_next = n0;
 		} else {
 			/* The headers can be rewritten in place in the copy. */
 			nip6 = mtod(n, struct ip6_hdr *);
 			IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off,
 			    sizeof(*nicmp6));
 			noff = off;
 		}
 		nicmp6->icmp6_type = ICMP6_ECHO_REPLY;
 		nicmp6->icmp6_code = 0;
 		if (n) {
 			V_icmp6stat.icp6s_reflect++;
 			V_icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++;
 			icmp6_reflect(n, noff);
 		}
 		break;
 
 	case ICMP6_ECHO_REPLY:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echoreply);
 		if (code != 0)
 			goto badcode;
 		break;
 
 	case MLD_LISTENER_QUERY:
 	case MLD_LISTENER_REPORT:
 		if (icmp6len < sizeof(struct mld_hdr))
 			goto badlen;
 		if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */
 			icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery);
 		else
 			icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport);
 		/*
 		 * Pattern shared with the ND cases below: the handler gets a
 		 * copy so that m can still go to the sockets.  If no copy can
 		 * be made the handler gets m itself and socket delivery is
 		 * skipped; m is set to NULL first, so the m_freem() at freeit
 		 * is handed NULL (presumably a no-op -- see mbuf(9)).
 		 */
 		if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
 			/* give up local */
 			mld6_input(m, off);
 			m = NULL;
 			goto freeit;
 		}
 		mld6_input(n, off);
 		/* m stays. */
 		break;
 
 	case MLD_LISTENER_DONE:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone);
 		if (icmp6len < sizeof(struct mld_hdr))	/* necessary? */
 			goto badlen;
 		break;		/* nothing to be done in kernel */
 
 	case MLD_MTRACE_RESP:
 	case MLD_MTRACE:
 		/* XXX: these two are experimental.  not officially defined. */
 		/* XXX: per-interface statistics? */
 		break;		/* just pass it to applications */
 
 	case ICMP6_WRUREQUEST:	/* ICMP6_FQDN_QUERY */
 	    {
 		enum { WRU, FQDN } mode;
 
 		if (!V_icmp6_nodeinfo)
 			break;
 
 		/* The message length alone selects how the query is handled. */
 		if (icmp6len == sizeof(struct icmp6_hdr) + 4)
 			mode = WRU;
 		else if (icmp6len >= sizeof(struct icmp6_nodeinfo))
 			mode = FQDN;
 		else
 			goto badlen;
 
 #define hostnamelen	strlen(V_hostname)
 		if (mode == FQDN) {
 #ifndef PULLDOWN_TEST
 			IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo),
 			    IPPROTO_DONE);
 #endif
 			n = m_copy(m, 0, M_COPYALL);
 			if (n)
 				n = ni6_input(n, off);
 			/* XXX meaningless if n == NULL */
 			noff = sizeof(struct ip6_hdr);
 		} else {
 			u_char *p;
 			int maxlen, maxhlen;
 
 			/*
 			 * XXX: this combination of flags is pointless,
 			 * but should we keep this for compatibility?
 			 */
 			if ((V_icmp6_nodeinfo & 5) != 5)
 				break;
 
 			if (code != 0)
 				goto badcode;
 			maxlen = sizeof(*nip6) + sizeof(*nicmp6) + 4;
 			if (maxlen >= MCLBYTES) {
 				/* Give up remote */
 				break;
 			}
 			MGETHDR(n, M_DONTWAIT, m->m_type);
 			if (n && maxlen > MHLEN) {
 				MCLGET(n, M_DONTWAIT);
 				if ((n->m_flags & M_EXT) == 0) {
 					m_free(n);
 					n = NULL;
 				}
 			}
 			if (n && !m_dup_pkthdr(n, m, M_DONTWAIT)) {
 				/*
 				 * Previous code did a blind M_COPY_PKTHDR
 				 * and said "just for rcvif".  If true, then
 				 * we could tolerate the dup failing (due to
 				 * the deep copy of the tag chain).  For now
 				 * be conservative and just fail.
 				 */
 				m_free(n);
 				n = NULL;
 			}
 			if (n == NULL) {
 				/* Give up remote */
 				break;
 			}
 			n->m_pkthdr.rcvif = NULL;
 			n->m_len = 0;
 			/*
 			 * Limit the hostname to the room left in the mbuf
 			 * once the fixed part of the reply is accounted for.
 			 */
 			maxhlen = M_TRAILINGSPACE(n) - maxlen;
 			mtx_lock(&hostname_mtx);
 			if (maxhlen > hostnamelen)
 				maxhlen = hostnamelen;
 			/*
 			 * Copy IPv6 and ICMPv6 only.
 			 */
 			nip6 = mtod(n, struct ip6_hdr *);
 			bcopy(ip6, nip6, sizeof(struct ip6_hdr));
 			nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
 			bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
 			p = (u_char *)(nicmp6 + 1);
 			bzero(p, 4);
 			bcopy(V_hostname, p + 4, maxhlen); /* meaningless TTL */
 			mtx_unlock(&hostname_mtx);
 			noff = sizeof(struct ip6_hdr);
 			n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) +
 				sizeof(struct icmp6_hdr) + 4 + maxhlen;
 			nicmp6->icmp6_type = ICMP6_WRUREPLY;
 			nicmp6->icmp6_code = 0;
 		}
 #undef hostnamelen
 		if (n) {
 			V_icmp6stat.icp6s_reflect++;
 			V_icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++;
 			icmp6_reflect(n, noff);
 		}
 		break;
 	    }
 
 	case ICMP6_WRUREPLY:
 		if (code != 0)
 			goto badcode;
 		break;
 
 	case ND_ROUTER_SOLICIT:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routersolicit);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_router_solicit))
 			goto badlen;
 		if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
 			/* give up local */
 			nd6_rs_input(m, off, icmp6len);
 			m = NULL;
 			goto freeit;
 		}
 		nd6_rs_input(n, off, icmp6len);
 		/* m stays. */
 		break;
 
 	case ND_ROUTER_ADVERT:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routeradvert);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_router_advert))
 			goto badlen;
 		if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
 			/* give up local */
 			nd6_ra_input(m, off, icmp6len);
 			m = NULL;
 			goto freeit;
 		}
 		nd6_ra_input(n, off, icmp6len);
 		/* m stays. */
 		break;
 
 	case ND_NEIGHBOR_SOLICIT:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighborsolicit);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_neighbor_solicit))
 			goto badlen;
 		if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
 			/* give up local */
 			nd6_ns_input(m, off, icmp6len);
 			m = NULL;
 			goto freeit;
 		}
 		nd6_ns_input(n, off, icmp6len);
 		/* m stays. */
 		break;
 
 	case ND_NEIGHBOR_ADVERT:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighboradvert);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_neighbor_advert))
 			goto badlen;
 		if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
 			/* give up local */
 			nd6_na_input(m, off, icmp6len);
 			m = NULL;
 			goto freeit;
 		}
 		nd6_na_input(n, off, icmp6len);
 		/* m stays. */
 		break;
 
 	case ND_REDIRECT:
 		icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_redirect);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_redirect))
 			goto badlen;
 		if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
 			/* give up local */
 			icmp6_redirect_input(m, off);
 			m = NULL;
 			goto freeit;
 		}
 		icmp6_redirect_input(n, off);
 		/* m stays. */
 		break;
 
 	case ICMP6_ROUTER_RENUMBERING:
 		if (code != ICMP6_ROUTER_RENUMBERING_COMMAND &&
 		    code != ICMP6_ROUTER_RENUMBERING_RESULT)
 			goto badcode;
 		if (icmp6len < sizeof(struct icmp6_router_renum))
 			goto badlen;
 		break;
 
 	default:
 		nd6log((LOG_DEBUG,
 		    "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n",
 		    icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 		    m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0));
 		if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) {
 			/* ICMPv6 error: MUST deliver it by spec... */
 			code = PRC_NCMDS;
 			/* deliver */
 		} else {
 			/* ICMPv6 informational: MUST not deliver */
 			break;
 		}
 	/*
 	 * The three labels below sit inside the switch on purpose: the
 	 * "break" that ends each of them leaves the switch and continues
 	 * with the socket delivery that follows it.
 	 */
 	deliver:
 		if (icmp6_notify_error(&m, off, icmp6len, code)) {
 			/* In this case, m should've been freed. */
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	badcode:
 		V_icmp6stat.icp6s_badcode++;
 		break;
 
 	badlen:
 		V_icmp6stat.icp6s_badlen++;
 		break;
 	}
 
 	/* deliver the packet to appropriate sockets */
 	icmp6_rip6_input(&m, *offp);
 
 	return IPPROTO_DONE;
 
  freeit:
 	m_freem(m);
 	return IPPROTO_DONE;
 }
 
 /*
  * Notify the upper-layer protocol about a received ICMPv6 error.
  *
  * *mp is the ICMPv6 packet, off the offset of its ICMPv6 header, icmp6len
  * the length from that header to the end of the packet, and code the PRC_*
  * value handed to the protocol's pr_ctlinput handler.  The extension
  * headers of the packet quoted inside the error are walked to find the
  * upper-layer protocol and, when a type 0 routing header is present, the
  * final destination.  For Packet Too Big, icmp6_mtudisc_update() is called
  * before the handler.
  *
  * Returns 0 and stores m back into *mp on success.  Returns -1 on failure;
  * the freeit path frees m here, while the other -1 returns come from
  * IP6_EXTHDR_CHECK()/IP6_EXTHDR_GET() failures (the caller assumes m has
  * been freed in that case as well).
  */
 static int
 icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code)
 {
 	INIT_VNET_INET6(curvnet);
 	struct mbuf *m = *mp;
 	struct icmp6_hdr *icmp6;
 	struct ip6_hdr *eip6;
 	u_int32_t notifymtu;
 	struct sockaddr_in6 icmp6src, icmp6dst;
 
 	/* The error must at least quote a complete IPv6 header. */
 	if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) {
 		V_icmp6stat.icp6s_tooshort++;
 		goto freeit;
 	}
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off,
 	    sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1);
 	icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 #else
 	IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off,
 	    sizeof(*icmp6) + sizeof(struct ip6_hdr));
 	if (icmp6 == NULL) {
 		V_icmp6stat.icp6s_tooshort++;
 		return (-1);
 	}
 #endif
 	/* eip6: the IPv6 header of the packet quoted in the error. */
 	eip6 = (struct ip6_hdr *)(icmp6 + 1);
 
 	/* Detect the upper level protocol */
 	{
 		void (*ctlfunc)(int, struct sockaddr *, void *);
 		u_int8_t nxt = eip6->ip6_nxt;
 		/* eoff: offset of the header currently examined. */
 		int eoff = off + sizeof(struct icmp6_hdr) +
 		    sizeof(struct ip6_hdr);
 		struct ip6ctlparam ip6cp;
 		struct in6_addr *finaldst = NULL;
 		int icmp6type = icmp6->icmp6_type;
 		struct ip6_frag *fh;
 		struct ip6_rthdr *rth;
 		struct ip6_rthdr0 *rth0;
 		int rthlen;
 
 		while (1) { /* XXX: should avoid infinite loop explicitly? */
 			struct ip6_ext *eh;
 
 			switch (nxt) {
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_DSTOPTS:
 			case IPPROTO_AH:
 #ifndef PULLDOWN_TEST
 				IP6_EXTHDR_CHECK(m, 0,
 				    eoff + sizeof(struct ip6_ext), -1);
 				eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff);
 #else
 				IP6_EXTHDR_GET(eh, struct ip6_ext *, m,
 				    eoff, sizeof(*eh));
 				if (eh == NULL) {
 					V_icmp6stat.icp6s_tooshort++;
 					return (-1);
 				}
 #endif
 
 				/*
 				 * The length field is scaled differently:
 				 * (len + 2) * 4 bytes for AH, (len + 1) * 8
 				 * bytes for the option headers.
 				 */
 				if (nxt == IPPROTO_AH)
 					eoff += (eh->ip6e_len + 2) << 2;
 				else
 					eoff += (eh->ip6e_len + 1) << 3;
 				nxt = eh->ip6e_nxt;
 				break;
 			case IPPROTO_ROUTING:
 				/*
 				 * When the erroneous packet contains a
 				 * routing header, we should examine the
 				 * header to determine the final destination.
 				 * Otherwise, we can't properly update
 				 * information that depends on the final
 				 * destination (e.g. path MTU).
 				 */
 #ifndef PULLDOWN_TEST
 				IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1);
 				rth = (struct ip6_rthdr *)
 				    (mtod(m, caddr_t) + eoff);
 #else
 				IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m,
 				    eoff, sizeof(*rth));
 				if (rth == NULL) {
 					V_icmp6stat.icp6s_tooshort++;
 					return (-1);
 				}
 #endif
 				rthlen = (rth->ip6r_len + 1) << 3;
 				/*
 				 * XXX: currently there is no
 				 * officially defined type other
 				 * than type-0.
 				 * Note that if the segment left field
 				 * is 0, all intermediate hops must
 				 * have been passed.
 				 */
 				if (rth->ip6r_segleft &&
 				    rth->ip6r_type == IPV6_RTHDR_TYPE_0) {
 					int hops;
 
 #ifndef PULLDOWN_TEST
 					IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1);
 					rth0 = (struct ip6_rthdr0 *)
 					    (mtod(m, caddr_t) + eoff);
 #else
 					IP6_EXTHDR_GET(rth0,
 					    struct ip6_rthdr0 *, m,
 					    eoff, rthlen);
 					if (rth0 == NULL) {
 						V_icmp6stat.icp6s_tooshort++;
 						return (-1);
 					}
 #endif
 					/* just ignore a bogus header */
 					/*
 					 * Otherwise the last address listed
 					 * in the header is the final
 					 * destination.
 					 */
 					if ((rth0->ip6r0_len % 2) == 0 &&
 					    (hops = rth0->ip6r0_len/2))
 						finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1);
 				}
 				eoff += rthlen;
 				nxt = rth->ip6r_nxt;
 				break;
 			case IPPROTO_FRAGMENT:
 #ifndef PULLDOWN_TEST
 				IP6_EXTHDR_CHECK(m, 0, eoff +
 				    sizeof(struct ip6_frag), -1);
 				fh = (struct ip6_frag *)(mtod(m, caddr_t) +
 				    eoff);
 #else
 				IP6_EXTHDR_GET(fh, struct ip6_frag *, m,
 				    eoff, sizeof(*fh));
 				if (fh == NULL) {
 					V_icmp6stat.icp6s_tooshort++;
 					return (-1);
 				}
 #endif
 				/*
 				 * Data after a fragment header is meaningless
 				 * unless it is the first fragment, but
 				 * we'll go to the notify label for path MTU
 				 * discovery.
 				 */
 				if (fh->ip6f_offlg & IP6F_OFF_MASK)
 					goto notify;
 
 				eoff += sizeof(struct ip6_frag);
 				nxt = fh->ip6f_nxt;
 				break;
 			default:
 				/*
 				 * This case includes ESP and the No Next
 				 * Header.  In such cases going to the notify
 				 * label does not have any meaning
 				 * (i.e. ctlfunc will be NULL), but we go
 				 * anyway since we might have to update
 				 * path MTU information.
 				 */
 				goto notify;
 			}
 		}
 	  notify:
 		/* Re-derive the ICMPv6 header pointer after the walk above. */
 #ifndef PULLDOWN_TEST
 		icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 #else
 		IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off,
 		    sizeof(*icmp6) + sizeof(struct ip6_hdr));
 		if (icmp6 == NULL) {
 			V_icmp6stat.icp6s_tooshort++;
 			return (-1);
 		}
 #endif
 
 		/*
 		 * retrieve parameters from the inner IPv6 header, and convert
 		 * them into sockaddr structures.
 		 * XXX: there is no guarantee that the source or destination
 		 * addresses of the inner packet are in the same scope as
 		 * the addresses of the icmp packet.  But there is no other
 		 * way to determine the zone.
 		 */
 		eip6 = (struct ip6_hdr *)(icmp6 + 1);
 
 		bzero(&icmp6dst, sizeof(icmp6dst));
 		icmp6dst.sin6_len = sizeof(struct sockaddr_in6);
 		icmp6dst.sin6_family = AF_INET6;
 		if (finaldst == NULL)
 			icmp6dst.sin6_addr = eip6->ip6_dst;
 		else
 			icmp6dst.sin6_addr = *finaldst;
 		if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL))
 			goto freeit;
 		bzero(&icmp6src, sizeof(icmp6src));
 		icmp6src.sin6_len = sizeof(struct sockaddr_in6);
 		icmp6src.sin6_family = AF_INET6;
 		icmp6src.sin6_addr = eip6->ip6_src;
 		if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL))
 			goto freeit;
 		icmp6src.sin6_flowinfo =
 		    (eip6->ip6_flow & IPV6_FLOWLABEL_MASK);
 
 		/* Bundle everything the control-input handlers need. */
 		if (finaldst == NULL)
 			finaldst = &eip6->ip6_dst;
 		ip6cp.ip6c_m = m;
 		ip6cp.ip6c_icmp6 = icmp6;
 		ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1);
 		ip6cp.ip6c_off = eoff;
 		ip6cp.ip6c_finaldst = finaldst;
 		ip6cp.ip6c_src = &icmp6src;
 		ip6cp.ip6c_nxt = nxt;
 
 		if (icmp6type == ICMP6_PACKET_TOO_BIG) {
 			notifymtu = ntohl(icmp6->icmp6_mtu);
 			ip6cp.ip6c_cmdarg = (void *)&notifymtu;
 			icmp6_mtudisc_update(&ip6cp, 1);	/*XXX*/
 		}
 
 		/* Look up the control-input handler of the protocol found. */
 		ctlfunc = (void (*)(int, struct sockaddr *, void *))
 		    (inet6sw[ip6_protox[nxt]].pr_ctlinput);
 		if (ctlfunc) {
 			(void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst,
 			    &ip6cp);
 		}
 	}
 	*mp = m;
 	return (0);
 
   freeit:
 	m_freem(m);
 	return (-1);
 }
 
 /*
  * Record the path MTU announced by an ICMPv6 Packet Too Big message.
  *
  * ip6cp supplies the final destination of the quoted packet
  * (ip6c_finaldst), the ICMPv6 header carrying the MTU (ip6c_icmp6) and the
  * mbuf whose receive interface is used to set the scope (ip6c_m).  Nothing
  * is recorded when validated is zero, when the announced MTU is smaller
  * than an IPv6 header plus a fragment header plus 8 bytes, or when the
  * scope cannot be set.  The TCP host cache is updated, and icp6s_pmtuchg
  * incremented, only if the new value is below what tcp_maxmtu6() reports
  * for the destination.
  */
 void
 icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated)
 {
 	INIT_VNET_INET6(curvnet);
 	struct in6_addr *dst = ip6cp->ip6c_finaldst;
 	struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6;
 	struct mbuf *m = ip6cp->ip6c_m;	/* will be necessary for scope issue */
 	u_int mtu = ntohl(icmp6->icmp6_mtu);
 	struct in_conninfo inc;
 
 #if 0
 	/*
 	 * RFC2460 section 5, last paragraph.
 	 * even though minimum link MTU for IPv6 is IPV6_MMTU,
 	 * we may see ICMPv6 too big with mtu < IPV6_MMTU
 	 * due to packet translator in the middle.
 	 * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for
 	 * special handling.
 	 */
 	if (mtu < IPV6_MMTU)
 		return;
 #endif
 
 	/*
 	 * we reject ICMPv6 too big with abnormally small value.
 	 * XXX what is the good definition of "abnormally small"?
 	 */
 	if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8)
 		return;
 
 	if (!validated)
 		return;
 
 	/*
 	 * In case the suggested mtu is less than IPV6_MMTU, we
 	 * only need to remember that it was for above mentioned
 	 * "alwaysfrag" case.
 	 * Try to be as close to the spec as possible.
 	 */
 	if (mtu < IPV6_MMTU)
 		mtu = IPV6_MMTU - 8;
 
 	/* Build the connection-info key for this IPv6 destination. */
 	bzero(&inc, sizeof(inc));
-	inc.inc_flags = 1; /* IPv6 */
+	inc.inc_flags |= INC_ISIPV6;
 	inc.inc6_faddr = *dst;
 	if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL))
 		return;
 
 	/* Only ever lower the recorded MTU. */
 	if (mtu < tcp_maxmtu6(&inc, NULL)) {
 		tcp_hc_updatemtu(&inc, mtu);
 		V_icmp6stat.icp6s_pmtuchg++;
 	}
 }
 
 /*
  * Process a Node Information Query packet, based on
  * draft-ietf-ipngwg-icmp-name-lookups-07.
  *
  * Spec incompatibilities:
  * - IPv6 Subject address handling
  * - IPv4 Subject address handling support missing
  * - Proxy reply (answer even if it's not for me)
  * - joins NI group address at in6_ifattach() time only, does not cope
  *   with hostname changes by sethostname(3)
  *
  * m   - the received packet, starting at the IPv6 header
  * off - offset of the Node Information header within m
  *
  * Returns a newly allocated reply mbuf (IPv6 header + NI reply, type already
  * set to ICMP6_NI_REPLY), or NULL when the query is dropped or an allocation
  * fails.  The query mbuf m is consumed on every path.
  */
 #define hostnamelen	strlen(V_hostname)
 static struct mbuf *
 ni6_input(struct mbuf *m, int off)
 {
 	INIT_VNET_INET6(curvnet);
 	INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */
 	struct icmp6_nodeinfo *ni6, *nni6;
 	struct mbuf *n = NULL;
 	u_int16_t qtype;
 	int subjlen;
 	int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo);
 	struct ni_reply_fqdn *fqdn;
 	int addrs;		/* for NI_QTYPE_NODEADDR */
 	struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */
 	struct in6_addr in6_subj; /* subject address */
 	struct ip6_hdr *ip6;
 	int oldfqdn = 0;	/* if 1, return pascal string (03 draft) */
 	char *subj = NULL;
 	struct in6_ifaddr *ia6 = NULL;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 #ifndef PULLDOWN_TEST
 	ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off);
 #else
 	IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6));
 	if (ni6 == NULL) {
 		/* m is already reclaimed */
 		return (NULL);
 	}
 #endif
 
 	/*
 	 * Validate IPv6 source address.
 	 * The default configuration MUST be to refuse answering queries from
 	 * global-scope addresses according to RFC4602.
 	 * Notes:
 	 *  - it's not very clear what "refuse" means; this implementation
 	 *    simply drops it.
 	 *  - it's not very easy to identify global-scope (unicast) addresses
 	 *    since there are many prefixes for them.  It should be safer
 	 *    and in practice sufficient to check "all" but loopback and
 	 *    link-local (note that site-local unicast was deprecated and
 	 *    ULA is defined as global scope-wise)
 	 */
 	if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 &&
 	    !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) &&
 	    !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src))
 		goto bad;
 
 	/*
 	 * Validate IPv6 destination address.
 	 *
 	 * The Responder must discard the Query without further processing
 	 * unless it is one of the Responder's unicast or anycast addresses, or
 	 * a link-local scope multicast address which the Responder has joined.
 	 * [RFC4602, Section 5.]
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
 			goto bad;
 		/* else it's a link-local multicast, fine */
 	} else {		/* unicast or anycast */
 		if ((ia6 = ip6_getdstifaddr(m)) == NULL)
 			goto bad; /* XXX impossible */
 
 		if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) &&
 		    !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) {
 			nd6log((LOG_DEBUG, "ni6_input: ignore node info to "
 				"a temporary address in %s:%d",
 			       __FILE__, __LINE__));
 			goto bad;
 		}
 	}
 
 	/* validate query Subject field. */
 	qtype = ntohs(ni6->ni_qtype);
 	/* the Subject is whatever follows the fixed NI header in the packet */
 	subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo);
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 	case NI_QTYPE_SUPTYPES:
 		/* 07 draft */
 		if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0)
 			break;
 		/* FALLTHROUGH */
 	case NI_QTYPE_FQDN:
 	case NI_QTYPE_NODEADDR:
 	case NI_QTYPE_IPV4ADDR:
 		switch (ni6->ni_code) {
 		case ICMP6_NI_SUBJ_IPV6:
 #if ICMP6_NI_SUBJ_IPV6 != 0
 		case 0:
 #endif
 			/*
 			 * backward compatibility - try to accept 03 draft
 			 * format, where no Subject is present.
 			 */
 			if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 &&
 			    subjlen == 0) {
 				oldfqdn++;
 				break;
 			}
 #if ICMP6_NI_SUBJ_IPV6 != 0
 			if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6)
 				goto bad;
 #endif
 
 			if (subjlen != sizeof(struct in6_addr))
 				goto bad;
 
 			/*
 			 * Validate Subject address.
 			 *
 			 * Not sure what exactly "address belongs to the node"
 			 * means in the spec, is it just unicast, or what?
 			 *
 			 * At this moment we consider Subject address as
 			 * "belong to the node" if the Subject address equals
 			 * to the IPv6 destination address; validation for
 			 * IPv6 destination address should have done enough
 			 * check for us.
 			 *
 			 * We do not do proxy at this moment.
 			 */
 			/* m_pulldown instead of copy? */
 			m_copydata(m, off + sizeof(struct icmp6_nodeinfo),
 			    subjlen, (caddr_t)&in6_subj);
 			if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL))
 				goto bad;
 
 			subj = (char *)&in6_subj;
 			if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj))
 				break;
 
 			/*
 			 * XXX if we are to allow other cases, we should really
 			 * be careful about scope here.
 			 * basically, we should disallow queries toward IPv6
 			 * destination X with subject Y,
 			 * if scope(X) > scope(Y).
 			 * if we allow scope(X) > scope(Y), it will result in
 			 * information leakage across scope boundary.
 			 */
 			goto bad;
 
 		case ICMP6_NI_SUBJ_FQDN:
 			/*
 			 * Validate Subject name with gethostname(3).
 			 *
 			 * The behavior may need some debate, since:
 			 * - we are not sure if the node has FQDN as
 			 *   hostname (returned by gethostname(3)).
 			 * - the code does wildcard match for truncated names.
 			 *   however, we are not sure if we want to perform
 			 *   wildcard match, if gethostname(3) side has
 			 *   truncated hostname.
 			 */
 			/* n temporarily holds our own name in DNS encoding */
 			mtx_lock(&hostname_mtx);
 			n = ni6_nametodns(V_hostname, hostnamelen, 0);
 			mtx_unlock(&hostname_mtx);
 			if (!n || n->m_next || n->m_len == 0)
 				goto bad;
 			IP6_EXTHDR_GET(subj, char *, m,
 			    off + sizeof(struct icmp6_nodeinfo), subjlen);
 			if (subj == NULL)
 				goto bad;
 			if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *),
 			    n->m_len)) {
 				goto bad;
 			}
 			/* done with the scratch copy; n is reused for the reply */
 			m_freem(n);
 			n = NULL;
 			break;
 
 		case ICMP6_NI_SUBJ_IPV4:	/* XXX: to be implemented? */
 		default:
 			goto bad;
 		}
 		break;
 	}
 
 	/* refuse based on configuration.  XXX ICMP6_NI_REFUSED? */
 	switch (qtype) {
 	case NI_QTYPE_FQDN:
 		if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0)
 			goto bad;
 		break;
 	case NI_QTYPE_NODEADDR:
 	case NI_QTYPE_IPV4ADDR:
 		if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0)
 			goto bad;
 		break;
 	}
 
 	/* guess reply length */
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 		break;		/* no reply data */
 	case NI_QTYPE_SUPTYPES:
 		replylen += sizeof(u_int32_t);
 		break;
 	case NI_QTYPE_FQDN:
 		/* XXX will append an mbuf */
 		replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen);
 		break;
 	case NI_QTYPE_NODEADDR:
 		addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj);
 		if ((replylen += addrs * (sizeof(struct in6_addr) +
 		    sizeof(u_int32_t))) > MCLBYTES)
 			replylen = MCLBYTES; /* XXX: will truncate pkt later */
 		break;
 	case NI_QTYPE_IPV4ADDR:
 		/* unsupported - should respond with unknown Qtype? */
 		break;
 	default:
 		/*
 		 * XXX: We must return a reply with the ICMP6 code
 		 * `unknown Qtype' in this case.  However we regard the case
 		 * as an FQDN query for backward compatibility.
 		 * Older versions set a random value to this field,
 		 * so it rarely varies in the defined qtypes.
 		 * But the mechanism is not reliable...
 		 * maybe we should obsolete older versions.
 		 */
 		qtype = NI_QTYPE_FQDN;
 		/* XXX will append an mbuf */
 		replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen);
 		oldfqdn++;
 		break;
 	}
 
 	/* allocate an mbuf to reply. */
 	MGETHDR(n, M_DONTWAIT, m->m_type);
 	if (n == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	M_MOVE_PKTHDR(n, m); /* just for recvif */
 	if (replylen > MHLEN) {
 		if (replylen > MCLBYTES) {
 			/*
 			 * XXX: should we try to allocate more? But MCLBYTES
 			 * is probably much larger than IPV6_MMTU...
 			 */
 			goto bad;
 		}
 		MCLGET(n, M_DONTWAIT);
 		if ((n->m_flags & M_EXT) == 0) {
 			goto bad;
 		}
 	}
 	n->m_pkthdr.len = n->m_len = replylen;
 
 	/* copy mbuf header and IPv6 + Node Information base headers */
 	bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr));
 	nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1);
 	bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo));
 
 	/* qtype dependent procedure */
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		nni6->ni_flags = 0;
 		break;
 	case NI_QTYPE_SUPTYPES:
 	{
 		u_int32_t v;
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		nni6->ni_flags = htons(0x0000);	/* raw bitmap */
 		/* supports NOOP, SUPTYPES, FQDN, and NODEADDR */
 		v = (u_int32_t)htonl(0x0000000f);
 		bcopy(&v, nni6 + 1, sizeof(u_int32_t));
 		break;
 	}
 	case NI_QTYPE_FQDN:
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) +
 		    sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo));
 		nni6->ni_flags = 0; /* XXX: meaningless TTL */
 		fqdn->ni_fqdn_ttl = 0;	/* ditto. */
 		/*
 		 * XXX do we really have FQDN in variable "hostname"?
 		 */
 		/* the encoded name travels in a second mbuf chained after n */
 		mtx_lock(&hostname_mtx);
 		n->m_next = ni6_nametodns(V_hostname, hostnamelen, oldfqdn);
 		mtx_unlock(&hostname_mtx);
 		if (n->m_next == NULL)
 			goto bad;
 		/* XXX we assume that n->m_next is not a chain */
 		if (n->m_next->m_next != NULL)
 			goto bad;
 		n->m_pkthdr.len += n->m_next->m_len;
 		break;
 	case NI_QTYPE_NODEADDR:
 	{
 		int lenlim, copied;
 
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		/* shrink to the headers so M_TRAILINGSPACE reports the data room */
 		n->m_pkthdr.len = n->m_len =
 		    sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo);
 		lenlim = M_TRAILINGSPACE(n);
 		copied = ni6_store_addrs(ni6, nni6, ifp, lenlim);
 		/* XXX: reset mbuf length */
 		n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) +
 		    sizeof(struct icmp6_nodeinfo) + copied;
 		break;
 	}
 	default:
 		break;		/* XXX impossible! */
 	}
 
 	nni6->ni_type = ICMP6_NI_REPLY;
 	m_freem(m);
 	return (n);
 
   bad:
 	/* common drop path: release the query and any partially built reply */
 	m_freem(m);
 	if (n)
 		m_freem(n);
 	return (NULL);
 }
 #undef hostnamelen
 
 /*
  * make a mbuf with DNS-encoded string.  no compression support.
  *
  * name, namelen - the host name and its length in bytes
  * old           - if non-zero, return a pascal string (one length byte
  *                 followed by the raw name) instead of DNS labels
  *
  * XXX names with less than 2 dots (like "foo" or "foo.section") will be
  * treated as truncated name (two \0 at the end).  this is a wild guess.
  *
  * Returns a single mbuf whose m_len covers the encoded name (an mbuf with
  * m_len 0 when namelen is 0 and old is zero).  Returns NULL when the mbuf
  * or cluster cannot be allocated, when a label is empty or longer than
  * 63 bytes, or when the encoding does not fit into the mbuf.
  */
 static struct mbuf *
 ni6_nametodns(const char *name, int namelen, int old)
 {
 	struct mbuf *m;
 	char *cp, *ep;
 	const char *p, *q;
 	int i, len, nterm;
 
 	len = old ? namelen + 1 : MCLBYTES;
 
 	/* because MAXHOSTNAMELEN is usually 256, we use cluster mbuf */
 	MGET(m, M_DONTWAIT, MT_DATA);
 	if (m == NULL)
 		goto fail;
 	if (len > MLEN) {
 		MCLGET(m, M_DONTWAIT);
 		if ((m->m_flags & M_EXT) == 0)
 			goto fail;
 	}
 	m->m_next = NULL;
 
 	if (old) {
 		/* pascal string: length byte, then the name verbatim */
 		m->m_len = len;
 		*mtod(m, char *) = namelen;
 		bcopy(name, mtod(m, char *) + 1, namelen);
 		return (m);
 	}
 
 	m->m_len = 0;
 	cp = mtod(m, char *);
 	ep = mtod(m, char *) + M_TRAILINGSPACE(m);
 
 	/* if not certain about my name, return empty buffer */
 	if (namelen == 0)
 		return (m);
 
 	/*
 	 * guess if it looks like shortened hostname, or FQDN.
 	 * shortened hostname needs two trailing "\0".
 	 */
 	i = 0;
 	for (p = name; p < name + namelen; p++) {
 		if (*p == '.')
 			i++;
 	}
 	nterm = (i < 2) ? 2 : 1;
 
 	/* emit one <length><bytes> label per dot-separated component */
 	p = name;
 	while (cp < ep && p < name + namelen) {
 		i = 0;
 		for (q = p; q < name + namelen && *q && *q != '.'; q++)
 			i++;
 		/* result does not fit into mbuf */
 		if (cp + i + 1 >= ep)
 			goto fail;
 		/*
 		 * DNS label length restriction, RFC1035 page 8.
 		 * "i == 0" case is included here to avoid returning
 		 * 0-length label on "foo..bar".
 		 */
 		if (i <= 0 || i >= 64)
 			goto fail;
 		*cp++ = i;
 		bcopy(p, cp, i);
 		cp += i;
 		p = q;
 		if (p < name + namelen && *p == '.')
 			p++;
 	}
 	/* termination */
 	if (cp + nterm >= ep)
 		goto fail;
 	while (nterm-- > 0)
 		*cp++ = '\0';
 	m->m_len = cp - mtod(m, char *);
 	return (m);
 
  fail:
 	if (m)
 		m_freem(m);
 	return (NULL);
 }
 
 /*
  * Check whether two DNS-encoded names (sequences of <length><bytes> labels
  * ending in a \0 byte) match.  Takes care of the truncated form (with \0\0
  * at the end): a truncated name matches any name that shares all of the
  * labels preceding the extra \0.  No compression support.
  * XXX upper/lowercase match (see RFC2065); labels are compared bytewise.
  *
  * a, alen - first encoded name and its length in bytes
  * b, blen - second encoded name and its length in bytes
  *
  * Returns 1 when the names match, 0 otherwise (including when either name
  * is malformed, unterminated, uses compression, or has a negative length).
  */
 static int
 ni6_dnsmatch(const char *a, int alen, const char *b, int blen)
 {
 	const char *a0, *b0;
 	int l;
 
 	/*
 	 * The lengths come from packet arithmetic in the caller; a negative
 	 * value must never reach bcmp(), where it would become a huge size.
 	 */
 	if (alen < 0 || blen < 0)
 		return 0;
 
 	/* simplest case - need validation? */
 	if (alen == blen && bcmp(a, b, alen) == 0)
 		return 1;
 
 	a0 = a;
 	b0 = b;
 
 	/* termination is mandatory */
 	if (alen < 2 || blen < 2)
 		return 0;
 	if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0')
 		return 0;
 	/* from here on the lengths exclude the final terminator */
 	alen--;
 	blen--;
 
 	while (a - a0 < alen && b - b0 < blen) {
 		if (a - a0 + 1 > alen || b - b0 + 1 > blen)
 			return 0;
 
 		if ((signed char)a[0] < 0 || (signed char)b[0] < 0)
 			return 0;
 		/* we don't support compression yet */
 		if (a[0] >= 64 || b[0] >= 64)
 			return 0;
 
 		/* truncated case */
 		if (a[0] == 0 && a - a0 == alen - 1)
 			return 1;
 		if (b[0] == 0 && b - b0 == blen - 1)
 			return 1;
 		/* a zero-length label anywhere else is malformed */
 		if (a[0] == 0 || b[0] == 0)
 			return 0;
 
 		if (a[0] != b[0])
 			return 0;
 		l = a[0];
 		if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen)
 			return 0;
 		if (bcmp(a + 1, b + 1, l) != 0)
 			return 0;
 
 		a += 1 + l;
 		b += 1 + l;
 	}
 
 	/* both names must have been consumed completely */
 	if (a - a0 == alen && b - b0 == blen)
 		return 1;
 	else
 		return 0;
 }
 
 /*
  * Count the addresses that a Node Addresses reply would carry.
  *
  * ni6  - the query header; its flags select which scopes are wanted
  * m    - the query packet (not consulted here)
  * ifpp - set to the interface that owns the subject address, if one is found
  * subj - subject IPv6 address; required unless NI_NODEADDR_FLAG_ALL is set
  *
  * Returns the number of eligible addresses: those of the subject's
  * interface when it is found, otherwise the sum over every interface
  * walked.  Returns 0 for an unsupported or missing subject.
  */
 static int
 ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp,
     struct in6_addr *subj)
 {
 	INIT_VNET_NET(curvnet);
 	INIT_VNET_INET6(curvnet);
 	struct ifnet *ifp;
 	struct in6_ifaddr *ia6;
 	struct ifaddr *ifa;
 	int total = 0, onthisif, matched = 0;
 	int niflags = ni6->ni_flags;
 	int scopeflag;
 
 	if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) {
 		/*
 		 * XXX: we only support IPv6 subject address for
 		 * this Qtype.
 		 */
 		if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6)
 			return (0);
 		if (subj == NULL) /* must be impossible... */
 			return (0);
 	}
 
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_list) {
 		onthisif = 0;
 		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ia6 = (struct in6_ifaddr *)ifa;
 
 			/* remember that this interface holds the subject */
 			if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 &&
 			    IN6_ARE_ADDR_EQUAL(subj, &ia6->ia_addr.sin6_addr))
 				matched = 1;
 
 			/*
 			 * IPv4-mapped addresses can only be returned by a
 			 * Node Information proxy, since they represent
 			 * addresses of IPv4-only nodes, which perforce do
 			 * not implement this protocol.
 			 * [icmp-name-lookups-07, Section 5.4]
 			 * So we don't support NI_NODEADDR_FLAG_COMPAT in
 			 * this function at this moment.
 			 */
 
 			/* What do we have to do about ::1? */
 			switch (in6_addrscope(&ia6->ia_addr.sin6_addr)) {
 			case IPV6_ADDR_SCOPE_LINKLOCAL:
 				scopeflag = NI_NODEADDR_FLAG_LINKLOCAL;
 				break;
 			case IPV6_ADDR_SCOPE_SITELOCAL:
 				scopeflag = NI_NODEADDR_FLAG_SITELOCAL;
 				break;
 			case IPV6_ADDR_SCOPE_GLOBAL:
 				scopeflag = NI_NODEADDR_FLAG_GLOBAL;
 				break;
 			default:
 				continue;
 			}
 			/* skip scopes the querier did not ask for */
 			if ((niflags & scopeflag) == 0)
 				continue;
 
 			/*
 			 * check if anycast is okay.
 			 * XXX: just experimental.  not in the spec.
 			 */
 			if ((ia6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
 			    (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
 				continue; /* we need only unicast addresses */
 			if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0)
 				continue;
 
 			onthisif++; /* count the address */
 		}
 		if (matched) {
 			/* the subject's interface alone answers the query */
 			*ifpp = ifp;
 			IFNET_RUNLOCK();
 			return (onthisif);
 		}
 
 		total += onthisif;
 	}
 	IFNET_RUNLOCK();
 
 	return (total);
 }
 
 static int
 ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6,
     struct ifnet *ifp0, int resid)
 {
 	INIT_VNET_NET(curvnet);
 	INIT_VNET_INET6(curvnet);
 	struct ifnet *ifp = ifp0 ? ifp0 : TAILQ_FIRST(&V_ifnet);
 	struct in6_ifaddr *ifa6;
 	struct ifaddr *ifa;
 	struct ifnet *ifp_dep = NULL;
 	int copied = 0, allow_deprecated = 0;
 	u_char *cp = (u_char *)(nni6 + 1);
 	int niflags = ni6->ni_flags;
 	u_int32_t ltime;
 
 	if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL))
 		return (0);	/* needless to copy */
 
 	IFNET_RLOCK();
   again:
 
 	for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) {
 		for (ifa = ifp->if_addrlist.tqh_first; ifa;
 		     ifa = ifa->ifa_list.tqe_next) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ifa6 = (struct in6_ifaddr *)ifa;
 
 			if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 &&
 			    allow_deprecated == 0) {
 				/*
 				 * prefererred address should be put before
 				 * deprecated addresses.
 				 */
 
 				/* record the interface for later search */
 				if (ifp_dep == NULL)
 					ifp_dep = ifp;
 
 				continue;
 			} else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 &&
 			    allow_deprecated != 0)
 				continue; /* we now collect deprecated addrs */
 
 			/* What do we have to do about ::1? */
 			switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) {
 			case IPV6_ADDR_SCOPE_LINKLOCAL:
 				if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0)
 					continue;
 				break;
 			case IPV6_ADDR_SCOPE_SITELOCAL:
 				if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0)
 					continue;
 				break;
 			case IPV6_ADDR_SCOPE_GLOBAL:
 				if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0)
 					continue;
 				break;
 			default:
 				continue;
 			}
 
 			/*
 			 * check if anycast is okay.
 			 * XXX: just experimental.  not in the spec.
 			 */
 			if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
 			    (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
 				continue;
 			if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) {
 				continue;
 			}
 
 			/* now we can copy the address */
 			if (resid < sizeof(struct in6_addr) +
 			    sizeof(u_int32_t)) {
 				/*
 				 * We give up much more copy.
 				 * Set the truncate flag and return.
 				 */
 				nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE;
 				IFNET_RUNLOCK();
 				return (copied);
 			}
 
 			/*
 			 * Set the TTL of the address.
 			 * The TTL value should be one of the following
 			 * according to the specification:
 			 *
 			 * 1. The remaining lifetime of a DHCP lease on the
 			 *    address, or
 			 * 2. The remaining Valid Lifetime of a prefix from
 			 *    which the address was derived through Stateless
 			 *    Autoconfiguration.
 			 *
 			 * Note that we currently do not support stateful
 			 * address configuration by DHCPv6, so the former
 			 * case can't happen.
 			 */
 			if (ifa6->ia6_lifetime.ia6t_expire == 0)
 				ltime = ND6_INFINITE_LIFETIME;
 			else {
 				if (ifa6->ia6_lifetime.ia6t_expire >
 				    time_second)
 					ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_second);
 				else
 					ltime = 0;
 			}
 
 			bcopy(&ltime, cp, sizeof(u_int32_t));
 			cp += sizeof(u_int32_t);
 
 			/* copy the address itself */
 			bcopy(&ifa6->ia_addr.sin6_addr, cp,
 			    sizeof(struct in6_addr));
 			in6_clearscope((struct in6_addr *)cp); /* XXX */
 			cp += sizeof(struct in6_addr);
 
 			resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t));
 			copied += (sizeof(struct in6_addr) + sizeof(u_int32_t));
 		}
 		if (ifp0)	/* we need search only on the specified IF */
 			break;
 	}
 
 	if (allow_deprecated == 0 && ifp_dep != NULL) {
 		ifp = ifp_dep;
 		allow_deprecated = 1;
 
 		goto again;
 	}
 
 	IFNET_RUNLOCK();
 
 	return (copied);
 }
 
 /*
  * XXX almost dup'ed code with rip6_input.
  *
  * Deliver a received ICMPv6 packet to the raw sockets that want it.
  *
  * mp  - pointer to the packet, starting at the IPv6 header
  * off - offset of the ICMPv6 header within the packet
  *
  * Every PCB on V_ripcb that is IPv6, bound to IPPROTO_ICMPV6, matches the
  * packet's addresses and does not filter the ICMPv6 type receives the
  * packet: all but the last matching PCB get a copy, the last one gets the
  * original mbuf.  When nothing matches the packet is freed and the
  * ip6s_delivered counter is decremented.  Always returns IPPROTO_DONE; the
  * mbuf is consumed on every path.
  */
 static int
 icmp6_rip6_input(struct mbuf **mp, int off)
 {
 	INIT_VNET_INET(curvnet);
 	INIT_VNET_INET6(curvnet);
 	struct mbuf *m = *mp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct inpcb *in6p;
 	struct inpcb *last = NULL;
 	struct sockaddr_in6 fromsa;
 	struct icmp6_hdr *icmp6;
 	struct mbuf *opts = NULL;
 
 #ifndef PULLDOWN_TEST
 	/* this is assumed to be safe. */
 	icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6));
 	if (icmp6 == NULL) {
 		/* m is already reclaimed */
 		return (IPPROTO_DONE);
 	}
 #endif
 
 	/*
 	 * XXX: the address may have embedded scope zone ID, which should be
 	 * hidden from applications.
 	 */
 	bzero(&fromsa, sizeof(fromsa));
 	fromsa.sin6_family = AF_INET6;
 	fromsa.sin6_len = sizeof(struct sockaddr_in6);
 	fromsa.sin6_addr = ip6->ip6_src;
 	if (sa6_recoverscope(&fromsa)) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	INP_INFO_RLOCK(&V_ripcbinfo);
 	LIST_FOREACH(in6p, &V_ripcb, inp_list) {
 		if ((in6p->inp_vflag & INP_IPV6) == 0)
 			continue;
 		if (in6p->inp_ip_p != IPPROTO_ICMPV6)
 			continue;
 		if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) &&
 		   !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst))
 			continue;
 		if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) &&
 		   !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src))
 			continue;
 		INP_RLOCK(in6p);
 		if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
 		    in6p->in6p_icmp6filt)) {
 			INP_RUNLOCK(in6p);
 			continue;
 		}
 		/*
 		 * in6p matches and stays read-locked.  Delivery lags one PCB
 		 * behind: the previously matched PCB (last) is served with a
 		 * copy here, so the final match can take the original mbuf.
 		 */
 		if (last != NULL) {
 			struct	mbuf *n = NULL;
 
 			/*
 			 * Recent network drivers tend to allocate a single
 			 * mbuf cluster, rather than to make a couple of
 			 * mbufs without clusters.  Also, since the IPv6 code
 			 * path tries to avoid m_pullup(), it is highly
 			 * probable that we still have an mbuf cluster here
 			 * even though the necessary length can be stored in an
 			 * mbuf's internal buffer.
 			 * Meanwhile, the default size of the receive socket
 			 * buffer for raw sockets is not so large.  This means
 			 * the possibility of packet loss is relatively higher
 			 * than before.  To avoid this scenario, we copy the
 			 * received data to a separate mbuf that does not use
 			 * a cluster, if possible.
 			 * XXX: it is better to copy the data after stripping
 			 * intermediate headers.
 			 */
 			if ((m->m_flags & M_EXT) && m->m_next == NULL &&
 			    m->m_len <= MHLEN) {
 				MGET(n, M_DONTWAIT, m->m_type);
 				if (n != NULL) {
 					if (m_dup_pkthdr(n, m, M_NOWAIT)) {
 						bcopy(m->m_data, n->m_data,
 						      m->m_len);
 						n->m_len = m->m_len;
 					} else {
 						m_free(n);
 						n = NULL;
 					}
 				}
 			}
 			/* fall back to m_copy() when the compact copy was not made */
 			if (n != NULL ||
 			    (n = m_copy(m, 0, (int)M_COPYALL)) != NULL) {
 				if (last->inp_flags & IN6P_CONTROLOPTS)
 					ip6_savecontrol(last, n, &opts);
 				/* strip intermediate headers */
 				m_adj(n, off);
 				SOCKBUF_LOCK(&last->inp_socket->so_rcv);
 				if (sbappendaddr_locked(
 				    &last->inp_socket->so_rcv,
 				    (struct sockaddr *)&fromsa, n, opts)
 				    == 0) {
 					/* should notify about lost packet */
 					m_freem(n);
 					if (opts) {
 						m_freem(opts);
 					}
 					SOCKBUF_UNLOCK(
 					    &last->inp_socket->so_rcv);
 				} else
 					sorwakeup_locked(last->inp_socket);
 				opts = NULL;
 			}
 			INP_RUNLOCK(last);
 		}
 		last = in6p;
 	}
 	INP_INFO_RUNLOCK(&V_ripcbinfo);
 	/* the final matching PCB (still read-locked) gets the original mbuf */
 	if (last != NULL) {
 		if (last->inp_flags & IN6P_CONTROLOPTS)
 			ip6_savecontrol(last, m, &opts);
 		/* strip intermediate headers */
 		m_adj(m, off);
 
 		/* avoid using mbuf clusters if possible (see above) */
 		if ((m->m_flags & M_EXT) && m->m_next == NULL &&
 		    m->m_len <= MHLEN) {
 			struct mbuf *n;
 
 			MGET(n, M_DONTWAIT, m->m_type);
 			if (n != NULL) {
 				if (m_dup_pkthdr(n, m, M_NOWAIT)) {
 					bcopy(m->m_data, n->m_data, m->m_len);
 					n->m_len = m->m_len;
 
 					m_freem(m);
 					m = n;
 				} else {
 					m_freem(n);
 					n = NULL;
 				}
 			}
 		}
 		SOCKBUF_LOCK(&last->inp_socket->so_rcv);
 		if (sbappendaddr_locked(&last->inp_socket->so_rcv,
 		    (struct sockaddr *)&fromsa, m, opts) == 0) {
 			m_freem(m);
 			if (opts)
 				m_freem(opts);
 			SOCKBUF_UNLOCK(&last->inp_socket->so_rcv);
 		} else
 			sorwakeup_locked(last->inp_socket);
 		INP_RUNLOCK(last);
 	} else {
 		/* nobody wanted it: drop the packet and undo the delivery count */
 		m_freem(m);
 		V_ip6stat.ip6s_delivered--;
 	}
 	return IPPROTO_DONE;
 }
 
 /*
  * Reflect the ip6 packet back to the source.
  *
  * m   - packet holding an IPv6 header followed (possibly after extension
  *       headers) by the ICMPv6 message to send back
  * off - offset of the icmp6 header, counted from the top of the mbuf
  *
  * Extension headers between IPv6 and ICMPv6 are stripped, the addresses are
  * turned around, a source address is chosen, the checksum is recomputed and
  * the packet is handed to ip6_output().  The mbuf is consumed: it is freed
  * here when the offset is bogus or no source address can be determined.
  */
 void
 icmp6_reflect(struct mbuf *m, size_t off)
 {
 	INIT_VNET_INET6(curvnet);
 	struct ip6_hdr *ip6;
 	struct icmp6_hdr *icmp6;
 	struct in6_ifaddr *ia;
 	int plen;
 	int type, code;
 	struct ifnet *outif = NULL;
 	struct in6_addr origdst, *src = NULL;
 	struct ip6_hdr nip6;
 	size_t hdrlen;
 	int stripped;
 
 	/* too short to reflect */
 	if (off < sizeof(struct ip6_hdr)) {
 		nd6log((LOG_DEBUG,
 		    "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n",
 		    (u_long)off, (u_long)sizeof(struct ip6_hdr),
 		    __FILE__, __LINE__));
 		goto bad;
 	}
 
 #ifdef DIAGNOSTIC
 	if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN)
 		panic("assumption failed in icmp6_reflect");
 #endif
 
 	/*
 	 * If there are extra headers between IPv6 and ICMPv6, strip
 	 * off that header first: save the IPv6 header, trim the front so
 	 * the ICMPv6 header lands right behind where it will go, and put
 	 * the saved header back once the data is contiguous.
 	 */
 	hdrlen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
 	stripped = (off > sizeof(struct ip6_hdr));
 	if (stripped) {
 		m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6);
 		m_adj(m, off - sizeof(struct ip6_hdr));
 	}
 	if (m->m_len < hdrlen) {
 		m = m_pullup(m, hdrlen);
 		if (m == NULL)
 			return;
 	}
 	if (stripped)
 		bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6));
 
 	plen = m->m_pkthdr.len - sizeof(struct ip6_hdr);
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	icmp6 = (struct icmp6_hdr *)(ip6 + 1);
 	/* keep type and code for statistics */
 	type = icmp6->icmp6_type;
 	code = icmp6->icmp6_code;
 
 	origdst = ip6->ip6_dst;
 	/*
 	 * ip6_input() drops a packet if its src is multicast.
 	 * So, the src is never multicast.
 	 */
 	ip6->ip6_dst = ip6->ip6_src;
 
 	/*
 	 * If the incoming packet was addressed directly to us (i.e. unicast),
 	 * use dst as the src for the reply.
 	 * The IN6_IFF_NOTREADY case should be VERY rare, but is possible
 	 * (for example) when we encounter an error while forwarding procedure
 	 * destined to a duplicated address of ours.
 	 * Note that ip6_getdstifaddr() may fail if we are in an error handling
 	 * procedure of an outgoing packet of our own, in which case we need
 	 * to search in the ifaddr list.
 	 */
 	if (!IN6_IS_ADDR_MULTICAST(&origdst)) {
 		ia = ip6_getdstifaddr(m);
 		if (ia == NULL) {
 			struct sockaddr_in6 d;
 
 			bzero(&d, sizeof(d));
 			d.sin6_family = AF_INET6;
 			d.sin6_len = sizeof(d);
 			d.sin6_addr = origdst;
 			ia = (struct in6_ifaddr *)
 			    ifa_ifwithaddr((struct sockaddr *)&d);
 		}
 		if (ia != NULL &&
 		    (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0)
 			src = &ia->ia_addr.sin6_addr;
 	}
 
 	if (src == NULL) {
 		int e;
 		struct sockaddr_in6 sin6;
 		struct route_in6 ro;
 
 		/*
 		 * This case matches to multicasts, our anycast, or unicasts
 		 * that we do not own.  Select a source address based on the
 		 * source address of the erroneous packet.
 		 */
 		bzero(&sin6, sizeof(sin6));
 		sin6.sin6_family = AF_INET6;
 		sin6.sin6_len = sizeof(sin6);
 		sin6.sin6_addr = ip6->ip6_dst; /* zone ID should be embedded */
 
 		bzero(&ro, sizeof(ro));
 		src = in6_selectsrc(&sin6, NULL, NULL, &ro, NULL, &outif, &e);
 		if (ro.ro_rt)
 			RTFREE(ro.ro_rt); /* XXX: we could use this */
 		if (src == NULL) {
 			char ip6buf[INET6_ADDRSTRLEN];
 			nd6log((LOG_DEBUG,
 			    "icmp6_reflect: source can't be determined: "
 			    "dst=%s, error=%d\n",
 			    ip6_sprintf(ip6buf, &sin6.sin6_addr), e));
 			goto bad;
 		}
 	}
 
 	/* rebuild the IPv6 header for the outgoing direction */
 	ip6->ip6_src = *src;
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	if (outif)
 		ip6->ip6_hlim = ND_IFINFO(outif)->chlim;
 	else if (m->m_pkthdr.rcvif) {
 		/* XXX: This may not be the outgoing interface */
 		ip6->ip6_hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim;
 	} else
 		ip6->ip6_hlim = V_ip6_defhlim;
 
 	/* the checksum field must be zero while the sum is computed */
 	icmp6->icmp6_cksum = 0;
 	icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), plen);
 
 	/*
 	 * XXX option handling
 	 */
 
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 
 	ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL);
 	if (outif)
 		icmp6_ifoutstat_inc(outif, type, code);
 
 	return;
 
  bad:
 	m_freem(m);
 	return;
 }
 
 /*
  * Timer entry point that deliberately does nothing: it takes no arguments,
  * touches no state and returns immediately.
  */
 void
 icmp6_fasttimo(void)
 {
 	/* nothing to do */
 }
 
 /*
  * Format the three addresses of a redirect as "(src=... dst=... tgt=...)"
  * for log messages.
  *
  * The result lives in a function-static buffer, so each call overwrites the
  * string returned by the previous one.
  */
 static const char *
 icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6,
     struct in6_addr *tgt6)
 {
 	static char buf[1024];
 	char srcstr[INET6_ADDRSTRLEN];
 	char dststr[INET6_ADDRSTRLEN];
 	char tgtstr[INET6_ADDRSTRLEN];
 	const char *s, *d, *t;
 
 	/* each address is rendered into its own scratch buffer */
 	s = ip6_sprintf(srcstr, src6);
 	d = ip6_sprintf(dststr, dst6);
 	t = ip6_sprintf(tgtstr, tgt6);
 
 	snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", s, d, t);
 	return (buf);
 }
 
 void
 icmp6_redirect_input(struct mbuf *m, int off)
 {
 	INIT_VNET_INET6(curvnet);
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct nd_redirect *nd_rd;
 	int icmp6len = ntohs(ip6->ip6_plen);
 	char *lladdr = NULL;
 	int lladdrlen = 0;
 	u_char *redirhdr = NULL;
 	int redirhdrlen = 0;
 	struct rtentry *rt = NULL;
 	int is_router;
 	int is_onlink;
 	struct in6_addr src6 = ip6->ip6_src;
 	struct in6_addr redtgt6;
 	struct in6_addr reddst6;
 	union nd_opts ndopts;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	if (!m)
 		return;
 
 	ifp = m->m_pkthdr.rcvif;
 
 	if (!ifp)
 		return;
 
 	/* XXX if we are router, we don't update route by icmp6 redirect */
 	if (V_ip6_forwarding)
 		goto freeit;
 	if (!V_icmp6_rediraccept)
 		goto freeit;
 
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, icmp6len,);
 	nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len);
 	if (nd_rd == NULL) {
 		V_icmp6stat.icp6s_tooshort++;
 		return;
 	}
 #endif
 	redtgt6 = nd_rd->nd_rd_target;
 	reddst6 = nd_rd->nd_rd_dst;
 
 	if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) ||
 	    in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) {
 		goto freeit;
 	}
 
 	/* validation */
 	if (!IN6_IS_ADDR_LINKLOCAL(&src6)) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect sent from %s rejected; "
 		    "must be from linklocal\n",
 		    ip6_sprintf(ip6buf, &src6)));
 		goto bad;
 	}
 	if (ip6->ip6_hlim != 255) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect sent from %s rejected; "
 		    "hlim=%d (must be 255)\n",
 		    ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim));
 		goto bad;
 	}
     {
 	/* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */
 	struct sockaddr_in6 sin6;
 	struct in6_addr *gw6;
 
 	bzero(&sin6, sizeof(sin6));
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	bcopy(&reddst6, &sin6.sin6_addr, sizeof(reddst6));
 	rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL);
 	if (rt) {
 		if (rt->rt_gateway == NULL ||
 		    rt->rt_gateway->sa_family != AF_INET6) {
 			nd6log((LOG_ERR,
 			    "ICMP6 redirect rejected; no route "
 			    "with inet6 gateway found for redirect dst: %s\n",
 			    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 			RTFREE_LOCKED(rt);
 			goto bad;
 		}
 
 		gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr);
 		if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) {
 			nd6log((LOG_ERR,
 			    "ICMP6 redirect rejected; "
 			    "not equal to gw-for-src=%s (must be same): "
 			    "%s\n",
 			    ip6_sprintf(ip6buf, gw6),
 			    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 			RTFREE_LOCKED(rt);
 			goto bad;
 		}
 	} else {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "no route found for redirect dst: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 	RTFREE_LOCKED(rt);
 	rt = NULL;
     }
 	if (IN6_IS_ADDR_MULTICAST(&reddst6)) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "redirect dst must be unicast: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	is_router = is_onlink = 0;
 	if (IN6_IS_ADDR_LINKLOCAL(&redtgt6))
 		is_router = 1;	/* router case */
 	if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0)
 		is_onlink = 1;	/* on-link destination case */
 	if (!is_router && !is_onlink) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "neither router case nor onlink case: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 	/* validation passed */
 
 	icmp6len -= sizeof(*nd_rd);
 	nd6_option_init(nd_rd + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO, "icmp6_redirect_input: "
 		    "invalid ND option, rejected: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	if (ndopts.nd_opts_tgt_lladdr) {
 		lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
 	}
 
 	if (ndopts.nd_opts_rh) {
 		redirhdrlen = ndopts.nd_opts_rh->nd_opt_rh_len;
 		redirhdr = (u_char *)(ndopts.nd_opts_rh + 1); /* xxx */
 	}
 
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO,
 		    "icmp6_redirect_input: lladdrlen mismatch for %s "
 		    "(if %d, icmp6 packet %d): %s\n",
 		    ip6_sprintf(ip6buf, &redtgt6),
 		    ifp->if_addrlen, lladdrlen - 2,
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	/* RFC 2461 8.3 */
 	nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT,
 	    is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER);
 
 	if (!is_onlink) {	/* better router case.  perform rtredirect. */
 		/* perform rtredirect */
 		struct sockaddr_in6 sdst;
 		struct sockaddr_in6 sgw;
 		struct sockaddr_in6 ssrc;
 
 		bzero(&sdst, sizeof(sdst));
 		bzero(&sgw, sizeof(sgw));
 		bzero(&ssrc, sizeof(ssrc));
 		sdst.sin6_family = sgw.sin6_family = ssrc.sin6_family = AF_INET6;
 		sdst.sin6_len = sgw.sin6_len = ssrc.sin6_len =
 			sizeof(struct sockaddr_in6);
 		bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr));
 		bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr));
 		bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr));
 		rtredirect((struct sockaddr *)&sdst, (struct sockaddr *)&sgw,
 		    (struct sockaddr *)NULL, RTF_GATEWAY | RTF_HOST,
 		    (struct sockaddr *)&ssrc);
 	}
 	/* finally update cached route in each socket via pfctlinput */
     {
 	struct sockaddr_in6 sdst;
 
 	bzero(&sdst, sizeof(sdst));
 	sdst.sin6_family = AF_INET6;
 	sdst.sin6_len = sizeof(struct sockaddr_in6);
 	bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr));
 	pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst);
 #ifdef IPSEC
 	key_sa_routechange((struct sockaddr *)&sdst);
 #endif /* IPSEC */
     }
 
  freeit:
 	m_freem(m);
 	return;
 
  bad:
 	V_icmp6stat.icp6s_badredirect++;
 	m_freem(m);
 }
 
 void
 icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt)
 {
 	INIT_VNET_INET6(curvnet);
 	struct ifnet *ifp;	/* my outgoing interface */
 	struct in6_addr *ifp_ll6;
 	struct in6_addr *router_ll6;
 	struct ip6_hdr *sip6;	/* m0 as struct ip6_hdr */
 	struct mbuf *m = NULL;	/* newly allocated one */
 	struct ip6_hdr *ip6;	/* m as struct ip6_hdr */
 	struct nd_redirect *nd_rd;
 	struct llentry *ln = NULL;
 	size_t maxlen;
 	u_char *p;
 	struct ifnet *outif = NULL;
 	struct sockaddr_in6 src_sa;
 
 	icmp6_errcount(&V_icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0);
 
 	/* if we are not router, we don't send icmp6 redirect */
 	if (!V_ip6_forwarding)
 		goto fail;
 
 	/* sanity check */
 	if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp))
 		goto fail;
 
 	/*
 	 * Address check:
 	 *  the source address must identify a neighbor, and
 	 *  the destination address must not be a multicast address
 	 *  [RFC 2461, sec 8.2]
 	 */
 	sip6 = mtod(m0, struct ip6_hdr *);
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = sip6->ip6_src;
 	if (nd6_is_addr_neighbor(&src_sa, ifp) == 0)
 		goto fail;
 	if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst))
 		goto fail;	/* what should we do here? */
 
 	/* rate limit */
 	if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0))
 		goto fail;
 
 	/*
 	 * Since we are going to append up to 1280 bytes (= IPV6_MMTU),
 	 * we almost always ask for an mbuf cluster for simplicity.
 	 * (MHLEN < IPV6_MMTU is almost always true)
 	 */
 #if IPV6_MMTU >= MCLBYTES
 # error assumption failed about IPV6_MMTU and MCLBYTES
 #endif
 	MGETHDR(m, M_DONTWAIT, MT_HEADER);
 	if (m && IPV6_MMTU >= MHLEN)
 		MCLGET(m, M_DONTWAIT);
 	if (!m)
 		goto fail;
 	m->m_pkthdr.rcvif = NULL;
 	m->m_len = 0;
 	maxlen = M_TRAILINGSPACE(m);
 	maxlen = min(IPV6_MMTU, maxlen);
 	/* just for safety */
 	if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) +
 	    ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) {
 		goto fail;
 	}
 
 	{
 		/* get ip6 linklocal address for ifp(my outgoing interface). */
 		struct in6_ifaddr *ia;
 		if ((ia = in6ifa_ifpforlinklocal(ifp,
 						 IN6_IFF_NOTREADY|
 						 IN6_IFF_ANYCAST)) == NULL)
 			goto fail;
 		ifp_ll6 = &ia->ia_addr.sin6_addr;
 	}
 
 	/* get ip6 linklocal address for the router. */
 	if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) {
 		struct sockaddr_in6 *sin6;
 		sin6 = (struct sockaddr_in6 *)rt->rt_gateway;
 		router_ll6 = &sin6->sin6_addr;
 		if (!IN6_IS_ADDR_LINKLOCAL(router_ll6))
 			router_ll6 = (struct in6_addr *)NULL;
 	} else
 		router_ll6 = (struct in6_addr *)NULL;
 
 	/* ip6 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	/* ip6->ip6_plen will be set later */
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_hlim = 255;
 	/* ip6->ip6_src must be linklocal addr for my outgoing if. */
 	bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr));
 	bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr));
 
 	/* ND Redirect */
 	nd_rd = (struct nd_redirect *)(ip6 + 1);
 	nd_rd->nd_rd_type = ND_REDIRECT;
 	nd_rd->nd_rd_code = 0;
 	nd_rd->nd_rd_reserved = 0;
 	if (rt->rt_flags & RTF_GATEWAY) {
 		/*
 		 * nd_rd->nd_rd_target must be a link-local address in
 		 * better router cases.
 		 */
 		if (!router_ll6)
 			goto fail;
 		bcopy(router_ll6, &nd_rd->nd_rd_target,
 		    sizeof(nd_rd->nd_rd_target));
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
 		    sizeof(nd_rd->nd_rd_dst));
 	} else {
 		/* make sure redtgt == reddst */
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target,
 		    sizeof(nd_rd->nd_rd_target));
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
 		    sizeof(nd_rd->nd_rd_dst));
 	}
 
 	p = (u_char *)(nd_rd + 1);
 
 	if (!router_ll6)
 		goto nolladdropt;
 
 	{
 		/* target lladdr option */
 		int len;
 		struct nd_opt_hdr *nd_opt;
 		char *lladdr;
 
 		IF_AFDATA_LOCK(ifp);
 		ln = nd6_lookup(router_ll6, 0, ifp);
 		IF_AFDATA_UNLOCK(ifp);
 		if (ln == NULL)
 			goto nolladdropt;
 
 		len = sizeof(*nd_opt) + ifp->if_addrlen;
 		len = (len + 7) & ~7;	/* round by 8 */
 		/* safety check */
 		if (len + (p - (u_char *)ip6) > maxlen) 			
 			goto nolladdropt;
 
 		if (ln->la_flags & LLE_VALID) {
 			nd_opt = (struct nd_opt_hdr *)p;
 			nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
 			nd_opt->nd_opt_len = len >> 3;
 			lladdr = (char *)(nd_opt + 1);
 			bcopy(&ln->ll_addr, lladdr, ifp->if_addrlen);
 			p += len;
 		}
 	}
 nolladdropt:
 	if (ln != NULL)
 		LLE_RUNLOCK(ln);
 		
 	m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
 
 	/* just to be safe */
 #ifdef M_DECRYPTED	/*not openbsd*/
 	if (m0->m_flags & M_DECRYPTED)
 		goto noredhdropt;
 #endif
 	if (p - (u_char *)ip6 > maxlen)
 		goto noredhdropt;
 
 	{
 		/* redirected header option */
 		int len;
 		struct nd_opt_rd_hdr *nd_opt_rh;
 
 		/*
 		 * compute the maximum size for icmp6 redirect header option.
 		 * XXX room for auth header?
 		 */
 		len = maxlen - (p - (u_char *)ip6);
 		len &= ~7;
 
 		/* This is just for simplicity. */
 		if (m0->m_pkthdr.len != m0->m_len) {
 			if (m0->m_next) {
 				m_freem(m0->m_next);
 				m0->m_next = NULL;
 			}
 			m0->m_pkthdr.len = m0->m_len;
 		}
 
 		/*
 		 * Redirected header option spec (RFC2461 4.6.3) talks nothing
 		 * about padding/truncate rule for the original IP packet.
 		 * From the discussion on IPv6imp in Feb 1999,
 		 * the consensus was:
 		 * - "attach as much as possible" is the goal
 		 * - pad if not aligned (original size can be guessed by
 		 *   original ip6 header)
 		 * Following code adds the padding if it is simple enough,
 		 * and truncates if not.
 		 */
 		if (m0->m_next || m0->m_pkthdr.len != m0->m_len)
 			panic("assumption failed in %s:%d", __FILE__,
 			    __LINE__);
 
 		if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) {
 			/* not enough room, truncate */
 			m0->m_pkthdr.len = m0->m_len = len -
 			    sizeof(*nd_opt_rh);
 		} else {
 			/* enough room, pad or truncate */
 			size_t extra;
 
 			extra = m0->m_pkthdr.len % 8;
 			if (extra) {
 				/* pad if easy enough, truncate if not */
 				if (8 - extra <= M_TRAILINGSPACE(m0)) {
 					/* pad */
 					m0->m_len += (8 - extra);
 					m0->m_pkthdr.len += (8 - extra);
 				} else {
 					/* truncate */
 					m0->m_pkthdr.len -= extra;
 					m0->m_len -= extra;
 				}
 			}
 			len = m0->m_pkthdr.len + sizeof(*nd_opt_rh);
 			m0->m_pkthdr.len = m0->m_len = len -
 			    sizeof(*nd_opt_rh);
 		}
 
 		nd_opt_rh = (struct nd_opt_rd_hdr *)p;
 		bzero(nd_opt_rh, sizeof(*nd_opt_rh));
 		nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER;
 		nd_opt_rh->nd_opt_rh_len = len >> 3;
 		p += sizeof(*nd_opt_rh);
 		m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
 
 		/* connect m0 to m */
 		m_tag_delete_chain(m0, NULL);
 		m0->m_flags &= ~M_PKTHDR;
 		m->m_next = m0;
 		m->m_pkthdr.len = m->m_len + m0->m_len;
 		m0 = NULL;
 	}
 noredhdropt:;
 	if (m0) {
 		m_freem(m0);
 		m0 = NULL;
 	}
 
 	/* XXX: clear embedded link IDs in the inner header */
 	in6_clearscope(&sip6->ip6_src);
 	in6_clearscope(&sip6->ip6_dst);
 	in6_clearscope(&nd_rd->nd_rd_target);
 	in6_clearscope(&nd_rd->nd_rd_dst);
 
 	ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
 
 	nd_rd->nd_rd_cksum = 0;
 	nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6,
 	    sizeof(*ip6), ntohs(ip6->ip6_plen));
 
 	/* send the packet to outside... */
 	ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL);
 	if (outif) {
 		icmp6_ifstat_inc(outif, ifs6_out_msg);
 		icmp6_ifstat_inc(outif, ifs6_out_redirect);
 	}
 	V_icmp6stat.icp6s_outhist[ND_REDIRECT]++;
 
 	return;
 
 fail:
 	if (m)
 		m_freem(m);
 	if (m0)
 		m_freem(m0);
 }
 
 /*
  * ICMPv6 socket option processing.
  */
 int
 icmp6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0;
 	int optlen;
 	struct inpcb *inp = sotoinpcb(so);
 	int level, op, optname;
 
 	if (sopt) {
 		level = sopt->sopt_level;
 		op = sopt->sopt_dir;
 		optname = sopt->sopt_name;
 		optlen = sopt->sopt_valsize;
 	} else
 		level = op = optname = optlen = 0;
 
 	if (level != IPPROTO_ICMPV6) {
 		return EINVAL;
 	}
 
 	switch (op) {
 	case PRCO_SETOPT:
 		switch (optname) {
 		case ICMP6_FILTER:
 		    {
 			struct icmp6_filter ic6f;
 
 			if (optlen != sizeof(ic6f)) {
 				error = EMSGSIZE;
 				break;
 			}
 			error = sooptcopyin(sopt, &ic6f, optlen, optlen);
 			if (error == 0) {
 				INP_WLOCK(inp);
 				*inp->in6p_icmp6filt = ic6f;
 				INP_WUNLOCK(inp);
 			}
 			break;
 		    }
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case PRCO_GETOPT:
 		switch (optname) {
 		case ICMP6_FILTER:
 		    {
 			struct icmp6_filter ic6f;
 
 			INP_RLOCK(inp);
 			ic6f = *inp->in6p_icmp6filt;
 			INP_RUNLOCK(inp);
 			error = sooptcopyout(sopt, &ic6f, sizeof(ic6f));
 			break;
 		    }
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Decide whether an ICMPv6 packet may be sent under the packets-per-second
  * limit.
  *
  * Returns 0 when the packet may be sent, 1 when the router SHOULD NOT
  * send it because the rate limit has been reached.
  *
  * XXX per-destination/type check necessary?
  *
  * dst, type and code are accepted for interface completeness but are not
  * consulted at this moment.
  */
 static int
 icmp6_ratelimit(const struct in6_addr *dst, const int type,
     const int code)
 {
 	INIT_VNET_INET6(curvnet);
 
 	/* A non-zero result from the PPS check means we are within budget. */
 	if (ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count,
 	    V_icmp6errppslim))
 		return 0;
 
 	/* The packet is subject to rate limit */
 	return 1;
 }
Index: head/sys/netinet6/ip6_output.c
===================================================================
--- head/sys/netinet6/ip6_output.c	(revision 186221)
+++ head/sys/netinet6/ip6_output.c	(revision 186222)
@@ -1,3348 +1,3348 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/ucred.h>
 #include <sys/vimage.h>
 
 #include <net/if.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/vinet.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #include <netipsec/key.h>
 #include <netinet6/ip6_ipsec.h>
 #endif /* IPSEC */
 
 #include <netinet6/ip6protosw.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/vinet6.h>
 
 static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "internet multicast options");
 
 /*
  * Working set of mbufs used by ip6_output() while it assembles the
  * header chain of an outgoing packet.  Each extension-header member is
  * NULL unless the corresponding option was supplied.
  */
 struct ip6_exthdrs {
 	struct mbuf *ip6e_ip6;		/* mbuf holding the IPv6 header */
 	struct mbuf *ip6e_hbh;		/* Hop-by-Hop options header */
 	struct mbuf *ip6e_dest1;	/* Destination options (1st part) */
 	struct mbuf *ip6e_rthdr;	/* Routing header */
 	struct mbuf *ip6e_dest2;	/* Destination options (2nd part) */
 };
 
 static int ip6_pcbopt __P((int, u_char *, int, struct ip6_pktopts **,
 			   struct ucred *, int));
 static int ip6_pcbopts __P((struct ip6_pktopts **, struct mbuf *,
 	struct socket *, struct sockopt *));
 static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
 static int ip6_setpktopt __P((int, u_char *, int, struct ip6_pktopts *,
 	struct ucred *, int, int, int));
 
 static int ip6_setmoptions(int, struct ip6_moptions **, struct mbuf *);
 static int ip6_getmoptions(int, struct ip6_moptions *, struct mbuf **);
 static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
 static int ip6_insertfraghdr __P((struct mbuf *, struct mbuf *, int,
 	struct ip6_frag **));
 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
 static int ip6_getpmtu __P((struct route_in6 *, struct route_in6 *,
 	struct ifnet *, struct in6_addr *, u_long *, int *));
 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
 
 
 /*
  * Make an extension header from option data.  hp is the source, and
  * mp is the destination.
  *
  * Nothing is done when hp is NULL.  Otherwise ((ip6e_len + 1) << 3)
  * bytes starting at hp are handed to ip6_copyexthdr() together with mp.
  * The macro is not self-contained: it assigns to a variable named
  * "error" and jumps to a label named "freehdrs" on failure, so both
  * must exist in the function that expands it.
  */
 #define MAKE_EXTHDR(hp, mp)						\
     do {								\
 	if (hp) {							\
 		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
 		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
 		    ((eh)->ip6e_len + 1) << 3);				\
 		if (error)						\
 			goto freehdrs;					\
 	}								\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Form a chain of extension headers.
  * m is the extension header mbuf
  * mp is the previous mbuf in the chain
  * p is the next header
  * i is the type of option.
  *
  * Nothing is done when m is NULL.  Otherwise the value currently stored
  * at *p is copied into the first byte of m's data, *p is overwritten
  * with i, and p is moved to point at that first byte of m.  m is then
  * linked in right after mp, and mp is advanced to m.
  * The macro reads a variable named "hdrsplit" from the expanding scope
  * and panics if it is zero.  Both mp and p are modified, so they must be
  * lvalues.
  */
 #define MAKE_CHAIN(m, mp, p, i)\
     do {\
 	if (m) {\
 		if (!hdrsplit) \
 			panic("assumption failed: hdr not split"); \
 		*mtod((m), u_char *) = *(p);\
 		*(p) = (i);\
 		p = mtod((m), u_char *);\
 		(m)->m_next = (mp)->m_next;\
 		(mp)->m_next = (m);\
 		(mp) = (m);\
 	}\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * IP6 output. The packet in mbuf chain m contains a skeletal IP6
  * header (with pri, len, nxt, hlim, src, dst).
  * This function may modify ver and hlim only.
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  *
  * type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and
  * nd_ifinfo.linkmtu is u_int32_t.  so we use u_long to hold largest one,
  * which is rt_rmx.rmx_mtu.
  *
  * ifpp - XXX: just for statistics
  */
 int
 ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
     struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
     struct ifnet **ifpp, struct inpcb *inp)
 {
 	INIT_VNET_NET(curvnet);
 	INIT_VNET_INET6(curvnet);
 	struct ip6_hdr *ip6, *mhip6;
 	struct ifnet *ifp, *origifp;
 	struct mbuf *m = m0;
 	struct mbuf *mprev = NULL;
 	int hlen, tlen, len, off;
 	struct route_in6 ip6route;
 	struct rtentry *rt = NULL;
 	struct sockaddr_in6 *dst, src_sa, dst_sa;
 	struct in6_addr odst;
 	int error = 0;
 	struct in6_ifaddr *ia = NULL;
 	u_long mtu;
 	int alwaysfrag, dontfrag;
 	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
 	struct ip6_exthdrs exthdrs;
 	struct in6_addr finaldst, src0, dst0;
 	u_int32_t zone;
 	struct route_in6 *ro_pmtu = NULL;
 	int hdrsplit = 0;
 	int needipsec = 0;
 #ifdef IPSEC
 	struct ipsec_output_state state;
 	struct ip6_rthdr *rh = NULL;
 	int needipsectun = 0;
 	int segleft_org = 0;
 	struct secpolicy *sp = NULL;
 #endif /* IPSEC */
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (ip6 == NULL) {
 		printf ("ip6 is NULL");
 		goto bad;
 	}
 
 	finaldst = ip6->ip6_dst;
 
 	bzero(&exthdrs, sizeof(exthdrs));
 
 	if (opt) {
 		/* Hop-by-Hop options header */
 		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
 		/* Destination options header(1st part) */
 		if (opt->ip6po_rthdr) {
 			/*
 			 * Destination options header(1st part)
 			 * This only makes sense with a routing header.
 			 * See Section 9.2 of RFC 3542.
 			 * Disabling this part just for MIP6 convenience is
 			 * a bad idea.  We need to think carefully about a
 			 * way to make the advanced API coexist with MIP6
 			 * options, which might automatically be inserted in
 			 * the kernel.
 			 */
 			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
 		}
 		/* Routing header */
 		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
 		/* Destination options header(2nd part) */
 		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
 	}
 
 	/*
 	 * IPSec checking which handles several cases.
 	 * FAST IPSEC: We re-injected the packet.
 	 */
 #ifdef IPSEC
 	switch(ip6_ipsec_output(&m, inp, &flags, &error, &ifp, &sp))
 	{
 	case 1:                 /* Bad packet */
 		goto freehdrs;
 	case -1:                /* Do IPSec */
 		needipsec = 1;
 	case 0:                 /* No IPSec */
 	default:
 		break;
 	}
 #endif /* IPSEC */
 
 	/*
 	 * Calculate the total length of the extension header chain.
 	 * Keep the length of the unfragmentable part for fragmentation.
 	 */
 	optlen = 0;
 	if (exthdrs.ip6e_hbh)
 		optlen += exthdrs.ip6e_hbh->m_len;
 	if (exthdrs.ip6e_dest1)
 		optlen += exthdrs.ip6e_dest1->m_len;
 	if (exthdrs.ip6e_rthdr)
 		optlen += exthdrs.ip6e_rthdr->m_len;
 	unfragpartlen = optlen + sizeof(struct ip6_hdr);
 
 	/* NOTE: we don't add AH/ESP length here. do that later. */
 	if (exthdrs.ip6e_dest2)
 		optlen += exthdrs.ip6e_dest2->m_len;
 
 	/*
 	 * If we need IPsec, or there is at least one extension header,
 	 * separate IP6 header from the payload.
 	 */
 	if ((needipsec || optlen) && !hdrsplit) {
 		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 			m = NULL;
 			goto freehdrs;
 		}
 		m = exthdrs.ip6e_ip6;
 		hdrsplit++;
 	}
 
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* adjust mbuf packet header length */
 	m->m_pkthdr.len += optlen;
 	plen = m->m_pkthdr.len - sizeof(*ip6);
 
 	/* If this is a jumbo payload, insert a jumbo payload option. */
 	if (plen > IPV6_MAXPACKET) {
 		if (!hdrsplit) {
 			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 				m = NULL;
 				goto freehdrs;
 			}
 			m = exthdrs.ip6e_ip6;
 			hdrsplit++;
 		}
 		/* adjust pointer */
 		ip6 = mtod(m, struct ip6_hdr *);
 		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
 			goto freehdrs;
 		ip6->ip6_plen = 0;
 	} else
 		ip6->ip6_plen = htons(plen);
 
 	/*
 	 * Concatenate headers and fill in next header fields.
 	 * Here we have, on "m"
 	 *	IPv6 payload
 	 * and we insert headers accordingly.  Finally, we should be getting:
 	 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
 	 *
 	 * during the header composing process, "m" points to IPv6 header.
 	 * "mprev" points to an extension header prior to esp.
 	 */
 	u_char *nexthdrp = &ip6->ip6_nxt;
 	mprev = m;
 
 	/*
 	 * we treat dest2 specially.  this makes IPsec processing
 	 * much easier.  the goal here is to make mprev point the
 	 * mbuf prior to dest2.
 	 *
 	 * result: IPv6 dest2 payload
 	 * m and mprev will point to IPv6 header.
 	 */
 	if (exthdrs.ip6e_dest2) {
 		if (!hdrsplit)
 			panic("assumption failed: hdr not split");
 		exthdrs.ip6e_dest2->m_next = m->m_next;
 		m->m_next = exthdrs.ip6e_dest2;
 		*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
 		ip6->ip6_nxt = IPPROTO_DSTOPTS;
 	}
 
 	/*
 	 * result: IPv6 hbh dest1 rthdr dest2 payload
 	 * m will point to IPv6 header.  mprev will point to the
 	 * extension header prior to dest2 (rthdr in the above case).
 	 */
 	MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
 	MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
 		   IPPROTO_DSTOPTS);
 	MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
 		   IPPROTO_ROUTING);
 
 #ifdef IPSEC
 	if (!needipsec)
 		goto skip_ipsec2;
 
 	/*
 	 * pointers after IPsec headers are not valid any more.
 	 * other pointers need a great care too.
 	 * (IPsec routines should not mangle mbufs prior to AH/ESP)
 	 */
 	exthdrs.ip6e_dest2 = NULL;
 
 	if (exthdrs.ip6e_rthdr) {
 		rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *);
 		segleft_org = rh->ip6r_segleft;
 		rh->ip6r_segleft = 0;
 	}
 
 	bzero(&state, sizeof(state));
 	state.m = m;
 	error = ipsec6_output_trans(&state, nexthdrp, mprev, sp, flags,
 				    &needipsectun);
 	m = state.m;
 	if (error == EJUSTRETURN) {
 		/*
 		 * We had a SP with a level of 'use' and no SA. We
 		 * will just continue to process the packet without
 		 * IPsec processing.
 		 */
 		;
 	} else if (error) {
 		/* mbuf is already reclaimed in ipsec6_output_trans. */
 		m = NULL;
 		switch (error) {
 		case EHOSTUNREACH:
 		case ENETUNREACH:
 		case EMSGSIZE:
 		case ENOBUFS:
 		case ENOMEM:
 			break;
 		default:
 			printf("[%s:%d] (ipsec): error code %d\n",
 			    __func__, __LINE__, error);
 			/* FALLTHROUGH */
 		case ENOENT:
 			/* don't show these error codes to the user */
 			error = 0;
 			break;
 		}
 		goto bad;
 	} else if (!needipsectun) {
 		/*
 		 * In the FAST IPSec case we have already
 		 * re-injected the packet and it has been freed
 		 * by the ipsec_done() function.  So, just clean
 		 * up after ourselves.
 		 */
 		m = NULL;
 		goto done;
 	}
 	if (exthdrs.ip6e_rthdr) {
 		/* ah6_output doesn't modify mbuf chain */
 		rh->ip6r_segleft = segleft_org;
 	}
 skip_ipsec2:;
 #endif /* IPSEC */
 
 	/*
 	 * If there is a routing header, replace the destination address field
 	 * with the first hop of the routing header.
 	 */
 	if (exthdrs.ip6e_rthdr) {
 		struct ip6_rthdr *rh =
 			(struct ip6_rthdr *)(mtod(exthdrs.ip6e_rthdr,
 						  struct ip6_rthdr *));
 		struct ip6_rthdr0 *rh0;
 		struct in6_addr *addr;
 		struct sockaddr_in6 sa;
 
 		switch (rh->ip6r_type) {
 		case IPV6_RTHDR_TYPE_0:
 			 rh0 = (struct ip6_rthdr0 *)rh;
 			 addr = (struct in6_addr *)(rh0 + 1);
 
 			 /*
 			  * construct a sockaddr_in6 form of
 			  * the first hop.
 			  *
 			  * XXX: we may not have enough
 			  * information about its scope zone;
 			  * there is no standard API to pass
 			  * the information from the
 			  * application.
 			  */
 			 bzero(&sa, sizeof(sa));
 			 sa.sin6_family = AF_INET6;
 			 sa.sin6_len = sizeof(sa);
 			 sa.sin6_addr = addr[0];
 			 if ((error = sa6_embedscope(&sa,
 			     V_ip6_use_defzone)) != 0) {
 				 goto bad;
 			 }
 			 ip6->ip6_dst = sa.sin6_addr;
 			 bcopy(&addr[1], &addr[0], sizeof(struct in6_addr)
 			     * (rh0->ip6r0_segleft - 1));
 			 addr[rh0->ip6r0_segleft - 1] = finaldst;
 			 /* XXX */
 			 in6_clearscope(addr + rh0->ip6r0_segleft - 1);
 			 break;
 		default:	/* is it possible? */
 			 error = EINVAL;
 			 goto bad;
 		}
 	}
 
 	/* Source address validation */
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
 	    (flags & IPV6_UNSPECSRC) == 0) {
 		error = EOPNOTSUPP;
 		V_ip6stat.ip6s_badscope++;
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 		error = EOPNOTSUPP;
 		V_ip6stat.ip6s_badscope++;
 		goto bad;
 	}
 
 	V_ip6stat.ip6s_localout++;
 
 	/*
 	 * Route packet.
 	 */
 	if (ro == 0) {
 		ro = &ip6route;
 		bzero((caddr_t)ro, sizeof(*ro));
 	}
 	ro_pmtu = ro;
 	if (opt && opt->ip6po_rthdr)
 		ro = &opt->ip6po_route;
 	dst = (struct sockaddr_in6 *)&ro->ro_dst;
 
 again:
 	/*
 	 * if specified, try to fill in the traffic class field.
 	 * do not override if a non-zero value is already set.
 	 * we check the diffserv field and the ecn field separately.
 	 */
 	if (opt && opt->ip6po_tclass >= 0) {
 		int mask = 0;
 
 		if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
 			mask |= 0xfc;
 		if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
 			mask |= 0x03;
 		if (mask != 0)
 			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
 	}
 
 	/* fill in or override the hop limit field, if necessary. */
 	if (opt && opt->ip6po_hlim != -1)
 		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
 	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (im6o != NULL)
 			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
 		else
 			ip6->ip6_hlim = V_ip6_defmcasthlim;
 	}
 
 #ifdef IPSEC
 	/*
 	 * We may re-inject packets into the stack here.
 	 */
 	if (needipsec && needipsectun) {
 		struct ipsec_output_state state;
 
 		/*
 		 * All the extension headers will become inaccessible
 		 * (since they can be encrypted).
 		 * Don't panic, we need no more updates to extension headers
 		 * on inner IPv6 packet (since they are now encapsulated).
 		 *
 		 * IPv6 [ESP|AH] IPv6 [extension headers] payload
 		 */
 		bzero(&exthdrs, sizeof(exthdrs));
 		exthdrs.ip6e_ip6 = m;
 
 		bzero(&state, sizeof(state));
 		state.m = m;
 		state.ro = (struct route *)ro;
 		state.dst = (struct sockaddr *)dst;
 
 		error = ipsec6_output_tunnel(&state, sp, flags);
 
 		m = state.m;
 		ro = (struct route_in6 *)state.ro;
 		dst = (struct sockaddr_in6 *)state.dst;
 		if (error == EJUSTRETURN) {
 			/*
 			 * We had a SP with a level of 'use' and no SA. We
 			 * will just continue to process the packet without
 			 * IPsec processing.
 			 */
 			;
 		} else if (error) {
 			/* mbuf is already reclaimed in ipsec6_output_tunnel. */
 			m0 = m = NULL;
 			m = NULL;
 			switch (error) {
 			case EHOSTUNREACH:
 			case ENETUNREACH:
 			case EMSGSIZE:
 			case ENOBUFS:
 			case ENOMEM:
 				break;
 			default:
 				printf("[%s:%d] (ipsec): error code %d\n",
 				    __func__, __LINE__, error);
 				/* FALLTHROUGH */
 			case ENOENT:
 				/* don't show these error codes to the user */
 				error = 0;
 				break;
 			}
 			goto bad;
 		} else {
 			/*
 			 * In the FAST IPSec case we have already
 			 * re-injected the packet and it has been freed
 			 * by the ipsec_done() function.  So, just clean
 			 * up after ourselves.
 			 */
 			m = NULL;
 			goto done;
 		}
 
 		exthdrs.ip6e_ip6 = m;
 	}
 #endif /* IPSEC */
 
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	bzero(&dst_sa, sizeof(dst_sa));
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(dst_sa);
 	dst_sa.sin6_addr = ip6->ip6_dst;
 	if ((error = in6_selectroute(&dst_sa, opt, im6o, ro,
 	    &ifp, &rt)) != 0) {
 		switch (error) {
 		case EHOSTUNREACH:
 			V_ip6stat.ip6s_noroute++;
 			break;
 		case EADDRNOTAVAIL:
 		default:
 			break; /* XXX statistics? */
 		}
 		if (ifp != NULL)
 			in6_ifstat_inc(ifp, ifs6_out_discard);
 		goto bad;
 	}
 	if (rt == NULL) {
 		/*
 		 * If in6_selectroute() does not return a route entry,
 		 * dst may not have been updated.
 		 */
 		*dst = dst_sa;	/* XXX */
 	}
 
 	/*
 	 * then rt (for unicast) and ifp must be non-NULL valid values.
 	 */
 	if ((flags & IPV6_FORWARDING) == 0) {
 		/* XXX: the FORWARDING flag can be set for mrouting. */
 		in6_ifstat_inc(ifp, ifs6_out_request);
 	}
 	if (rt != NULL) {
 		ia = (struct in6_ifaddr *)(rt->rt_ifa);
 		rt->rt_use++;
 	}
 
 	/*
 	 * The outgoing interface must be in the zone of source and
 	 * destination addresses.  We should use ia_ifp to support the
 	 * case of sending packets to an address of our own.
 	 */
 	if (ia != NULL && ia->ia_ifp)
 		origifp = ia->ia_ifp;
 	else
 		origifp = ifp;
 
 	src0 = ip6->ip6_src;
 	if (in6_setscope(&src0, origifp, &zone))
 		goto badscope;
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = ip6->ip6_src;
 	if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id)
 		goto badscope;
 
 	dst0 = ip6->ip6_dst;
 	if (in6_setscope(&dst0, origifp, &zone))
 		goto badscope;
 	/* re-initialize to be sure */
 	bzero(&dst_sa, sizeof(dst_sa));
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(dst_sa);
 	dst_sa.sin6_addr = ip6->ip6_dst;
 	if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) {
 		goto badscope;
 	}
 
 	/* scope check is done. */
 	goto routefound;
 
   badscope:
 	V_ip6stat.ip6s_badscope++;
 	in6_ifstat_inc(origifp, ifs6_out_discard);
 	if (error == 0)
 		error = EHOSTUNREACH; /* XXX */
 	goto bad;
 
   routefound:
 	if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (opt && opt->ip6po_nextroute.ro_rt) {
 			/*
 			 * The nexthop is explicitly specified by the
 			 * application.  We assume the next hop is an IPv6
 			 * address.
 			 */
 			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
 		}
 		else if ((rt->rt_flags & RTF_GATEWAY))
 			dst = (struct sockaddr_in6 *)rt->rt_gateway;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
 	} else {
 		struct	in6_multi *in6m;
 
 		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
 
 		in6_ifstat_inc(ifp, ifs6_out_mcast);
 
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if (!(ifp->if_flags & IFF_MULTICAST)) {
 			V_ip6stat.ip6s_noroute++;
 			in6_ifstat_inc(ifp, ifs6_out_discard);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m);
 		if (in6m != NULL &&
 		   (im6o == NULL || im6o->im6o_multicast_loop)) {
 			/*
 			 * If we belong to the destination multicast group
 			 * on the outgoing interface, and the caller did not
 			 * forbid loopback, loop back a copy.
 			 */
 			ip6_mloopback(ifp, m, dst);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IPV6_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip6_mloopback(),
 			 * above, will be forwarded by the ip6_input() routine,
 			 * if necessary.
 			 */
 			if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
 				/*
 				 * XXX: ip6_mforward expects that rcvif is NULL
 				 * when it is called from the originating path.
 				 * However, it is not always the case, since
 				 * some versions of MGETHDR() does not
 				 * initialize the field.
 				 */
 				m->m_pkthdr.rcvif = NULL;
 				if (ip6_mforward(ip6, ifp, m) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 		/*
 		 * Multicasts with a hoplimit of zero may be looped back,
 		 * above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip6_mloopback() will
 		 * loop back a copy if this host actually belongs to the
 		 * destination group on the loopback interface.
 		 */
 		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
 		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
 			m_freem(m);
 			goto done;
 		}
 	}
 
 	/*
 	 * Fill the outgoing interface to tell the upper layer
 	 * to increment per-interface statistics.
 	 */
 	if (ifpp)
 		*ifpp = ifp;
 
 	/* Determine path MTU. */
 	if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
 	    &alwaysfrag)) != 0)
 		goto bad;
 
 	/*
 	 * The caller of this function may specify to use the minimum MTU
 	 * in some cases.
 	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
 	 * setting.  The logic is a bit complicated; by default, unicast
 	 * packets will follow path MTU while multicast packets will be sent at
 	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
 	 * including unicast ones will be sent at the minimum MTU.  Multicast
 	 * packets will always be sent at the minimum MTU unless
 	 * IP6PO_MINMTU_DISABLE is explicitly specified.
 	 * See RFC 3542 for more details.
 	 */
 	if (mtu > IPV6_MMTU) {
 		if ((flags & IPV6_MINMTU))
 			mtu = IPV6_MMTU;
 		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
 			mtu = IPV6_MMTU;
 		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 			 (opt == NULL ||
 			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
 			mtu = IPV6_MMTU;
 		}
 	}
 
 	/*
 	 * clear embedded scope identifiers if necessary.
 	 * in6_clearscope will touch the addresses only when necessary.
 	 */
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 
 	/*
 	 * If the outgoing packet contains a hop-by-hop options header,
 	 * it must be examined and processed even by the source node.
 	 * (RFC 2460, section 4.)
 	 */
 	if (exthdrs.ip6e_hbh) {
 		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
 		u_int32_t dummy; /* XXX unused */
 		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
 
 #ifdef DIAGNOSTIC
 		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
 			panic("ip6e_hbh is not continuous");
 #endif
 		/*
 		 *  XXX: if we have to send an ICMPv6 error to the sender,
 		 *       we need the M_LOOP flag since icmp6_error() expects
 		 *       the IPv6 and the hop-by-hop options header are
 		 *       continuous unless the flag is set.
 		 */
 		m->m_flags |= M_LOOP;
 		m->m_pkthdr.rcvif = ifp;
 		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
 		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
 		    &dummy, &plen) < 0) {
 			/* m was already freed at this point */
 			error = EINVAL;/* better error? */
 			goto done;
 		}
 		m->m_flags &= ~M_LOOP; /* XXX */
 		m->m_pkthdr.rcvif = NULL;
 	}
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&inet6_pfil_hook))
 		goto passout;
 
 	odst = ip6->ip6_dst;
 	/* Run through list of hooks for output packets. */
 	error = pfil_run_hooks(&inet6_pfil_hook, &m, ifp, PFIL_OUT, inp);
 	if (error != 0 || m == NULL)
 		goto done;
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* See if destination IP address was changed by packet filter. */
 	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip6_input(). */
 		if (in6_localaddr(&ip6->ip6_dst)) {
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 				m->m_pkthdr.csum_flags |=
 				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 			m->m_pkthdr.csum_flags |=
 			    CSUM_IP_CHECKED | CSUM_IP_VALID;
 			error = netisr_queue(NETISR_IPV6, m);
 			goto done;
 		} else
 			goto again;	/* Redo the routing table lookup. */
 	}
 
 	/* XXX: IPFIREWALL_FORWARD */
 
 passout:
 	/*
 	 * Send the packet to the outgoing interface.
 	 * If necessary, do IPv6 fragmentation before sending.
 	 *
 	 * the logic here is rather complex:
 	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
 	 * 1-a:	send as is if tlen <= path mtu
 	 * 1-b:	fragment if tlen > path mtu
 	 *
 	 * 2: if user asks us not to fragment (dontfrag == 1)
 	 * 2-a:	send as is if tlen <= interface mtu
 	 * 2-b:	error if tlen > interface mtu
 	 *
 	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
 	 *	always fragment
 	 *
 	 * 4: if dontfrag == 1 && alwaysfrag == 1
 	 *	error, as we cannot handle this conflicting request
 	 */
 	tlen = m->m_pkthdr.len;
 
 	if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG))
 		dontfrag = 1;
 	else
 		dontfrag = 0;
 	if (dontfrag && alwaysfrag) {	/* case 4 */
 		/* conflicting request - can't transmit */
 		error = EMSGSIZE;
 		goto bad;
 	}
 	if (dontfrag && tlen > IN6_LINKMTU(ifp)) {	/* case 2-b */
 		/*
 		 * Even if the DONTFRAG option is specified, we cannot send the
 		 * packet when the data length is larger than the MTU of the
 		 * outgoing interface.
 		 * Notify the error by sending IPV6_PATHMTU ancillary data as
 		 * well as returning an error code (the latter is not described
 		 * in the API spec.)
 		 */
 		u_int32_t mtu32;
 		struct ip6ctlparam ip6cp;
 
 		mtu32 = (u_int32_t)mtu;
 		bzero(&ip6cp, sizeof(ip6cp));
 		ip6cp.ip6c_cmdarg = (void *)&mtu32;
 		pfctlinput2(PRC_MSGSIZE, (struct sockaddr *)&ro_pmtu->ro_dst,
 		    (void *)&ip6cp);
 
 		error = EMSGSIZE;
 		goto bad;
 	}
 
 	/*
 	 * transmit packet without fragmentation
 	 */
 	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* case 1-a and 2-a */
 		struct in6_ifaddr *ia6;
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
 		if (ia6) {
 			/* Record statistics for this interface address. */
 			ia6->ia_ifa.if_opackets++;
 			ia6->ia_ifa.if_obytes += m->m_pkthdr.len;
 		}
 		error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
 		goto done;
 	}
 
 	/*
 	 * try to fragment the packet.  case 1-b and 3
 	 */
 	if (mtu < IPV6_MMTU) {
 		/* path MTU cannot be less than IPV6_MMTU */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else if (ip6->ip6_plen == 0) {
 		/* jumbo payload cannot be fragmented */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else {
 		struct mbuf **mnext, *m_frgpart;
 		struct ip6_frag *ip6f;
 		u_int32_t id = htonl(ip6_randomid());
 		u_char nextproto;
 
 		int qslots = ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len;
 
 		/*
 		 * Too large for the destination or interface;
 		 * fragment if possible.
 		 * Must be able to put at least 8 bytes per fragment.
 		 */
 		hlen = unfragpartlen;
 		if (mtu > IPV6_MAXPACKET)
 			mtu = IPV6_MAXPACKET;
 
 		len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
 		if (len < 8) {
 			error = EMSGSIZE;
 			in6_ifstat_inc(ifp, ifs6_out_fragfail);
 			goto bad;
 		}
 
 		/*
 		 * Verify that we have any chance at all of being able to queue
 		 *      the packet or packet fragments
 		 */
 		if (qslots <= 0 || ((u_int)qslots * (mtu - hlen)
 		    < tlen  /* - hlen */)) {
 			error = ENOBUFS;
 			V_ip6stat.ip6s_odropped++;
 			goto bad;
 		}
 
 		mnext = &m->m_nextpkt;
 
 		/*
 		 * Change the next header field of the last header in the
 		 * unfragmentable part.
 		 */
 		if (exthdrs.ip6e_rthdr) {
 			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
 			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_dest1) {
 			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
 			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_hbh) {
 			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
 			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
 		} else {
 			nextproto = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_FRAGMENT;
 		}
 
 		/*
 		 * Loop through length of segment after first fragment,
 		 * make new header and copy data of each part and link onto
 		 * chain.
 		 */
 		m0 = m;
 		for (off = hlen; off < tlen; off += len) {
 			MGETHDR(m, M_DONTWAIT, MT_HEADER);
 			if (!m) {
 				error = ENOBUFS;
 				V_ip6stat.ip6s_odropped++;
 				goto sendorfree;
 			}
 			m->m_pkthdr.rcvif = NULL;
 			m->m_flags = m0->m_flags & M_COPYFLAGS;
 			*mnext = m;
 			mnext = &m->m_nextpkt;
 			m->m_data += max_linkhdr;
 			mhip6 = mtod(m, struct ip6_hdr *);
 			*mhip6 = *ip6;
 			m->m_len = sizeof(*mhip6);
 			error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
 			if (error) {
 				V_ip6stat.ip6s_odropped++;
 				goto sendorfree;
 			}
 			ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
 			if (off + len >= tlen)
 				len = tlen - off;
 			else
 				ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
 			mhip6->ip6_plen = htons((u_short)(len + hlen +
 			    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
 			if ((m_frgpart = m_copy(m0, off, len)) == 0) {
 				error = ENOBUFS;
 				V_ip6stat.ip6s_odropped++;
 				goto sendorfree;
 			}
 			m_cat(m, m_frgpart);
 			m->m_pkthdr.len = len + hlen + sizeof(*ip6f);
 			m->m_pkthdr.rcvif = NULL;
 			ip6f->ip6f_reserved = 0;
 			ip6f->ip6f_ident = id;
 			ip6f->ip6f_nxt = nextproto;
 			V_ip6stat.ip6s_ofragments++;
 			in6_ifstat_inc(ifp, ifs6_out_fragcreat);
 		}
 
 		in6_ifstat_inc(ifp, ifs6_out_fragok);
 	}
 
 	/*
 	 * Remove leading garbages.
 	 */
 sendorfree:
 	m = m0->m_nextpkt;
 	m0->m_nextpkt = 0;
 	m_freem(m0);
 	for (m0 = m; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia) {
 				ia->ia_ifa.if_opackets++;
 				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
 			}
 			error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		V_ip6stat.ip6s_fragmented++;
 
 done:
 	if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */
 		RTFREE(ro->ro_rt);
 	} else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) {
 		RTFREE(ro_pmtu->ro_rt);
 	}
 #ifdef IPSEC
 	if (sp != NULL)
 		KEY_FREESP(&sp);
 #endif
 
 	return (error);
 
 freehdrs:
 	m_freem(exthdrs.ip6e_hbh);	/* m_freem will check if mbuf is 0 */
 	m_freem(exthdrs.ip6e_dest1);
 	m_freem(exthdrs.ip6e_rthdr);
 	m_freem(exthdrs.ip6e_dest2);
 	/* FALLTHROUGH */
 bad:
 	if (m)
 		m_freem(m);
 	goto done;
 }
 
 /*
  * Copy an extension header of hlen bytes into a newly allocated mbuf and
  * return that mbuf through *mp.  A cluster is attached when hlen is
  * larger than MLEN.  If hdr is NULL the mbuf is only sized (m_len is set)
  * and nothing is copied into it.
  *
  * Returns 0 on success, or ENOBUFS when hlen exceeds MCLBYTES or an
  * allocation fails; *mp is left untouched on failure.
  */
 static int
 ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
 {
 	struct mbuf *copy;
 
 	/* A header that cannot fit in a single cluster is refused. */
 	if (hlen > MCLBYTES)
 		return (ENOBUFS); /* XXX */
 
 	MGET(copy, M_DONTWAIT, MT_DATA);
 	if (copy == NULL)
 		return (ENOBUFS);
 
 	/* The plain mbuf data area is too small: attach a cluster. */
 	if (hlen > MLEN) {
 		MCLGET(copy, M_DONTWAIT);
 		if (!(copy->m_flags & M_EXT)) {
 			m_free(copy);
 			return (ENOBUFS);
 		}
 	}
 
 	copy->m_len = hlen;
 	if (hdr != NULL)
 		bcopy(hdr, mtod(copy, caddr_t), hlen);
 
 	*mp = copy;
 	return (0);
 }
 
 /*
  * Add a jumbo payload option carrying plen to the hop-by-hop options
  * header tracked in exthdrs, creating that header if it does not exist.
  *
  * On success the packet header length of exthdrs->ip6e_ip6 is grown by
  * the 8 bytes that were added and 0 is returned.  ENOBUFS is returned
  * when an mbuf or cluster cannot be obtained, or when the enlarged
  * header would not fit in one cluster.
  */
 static int
 ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
 {
 	struct mbuf *mopt;
 	u_char *optbuf;
 	u_int32_t jumbolen;
 
 #define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
 
 	if (exthdrs->ip6e_hbh == NULL) {
 		/*
 		 * No hop-by-hop header yet: build one that holds nothing
 		 * but the jumbo option.  Byte 0 is not written here.
 		 */
 		MGET(mopt, M_DONTWAIT, MT_DATA);
 		if (mopt == NULL)
 			return (ENOBUFS);
 		mopt->m_len = JUMBOOPTLEN;
 		optbuf = mtod(mopt, u_char *);
 		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
 		exthdrs->ip6e_hbh = mopt;
 	} else {
 		struct ip6_hbh *hbh;
 
 		mopt = exthdrs->ip6e_hbh;
 		if (M_TRAILINGSPACE(mopt) >= JUMBOOPTLEN) {
 			/* Enough room behind the existing options. */
 			optbuf = mtod(mopt, u_char *) + mopt->m_len;
 			mopt->m_len += JUMBOOPTLEN;
 		} else {
 			/*
 			 * The existing mbuf is too small, so move the
 			 * options into a cluster.
 			 *
 			 * XXX this relies on exthdrs->ip6e_hbh being a
 			 * single mbuf (not a chain) that nobody else
 			 * holds a reference to.
 			 */
 			int oldoptlen = mopt->m_len;
 			struct mbuf *n;
 
 			/*
 			 * XXX: bail out if even a cluster cannot hold the
 			 * enlarged header.
 			 */
 			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
 				return (ENOBUFS);
 
 			/* A cluster is always required on this path. */
 			MGET(n, M_DONTWAIT, MT_DATA);
 			if (n == NULL)
 				return (ENOBUFS);
 			MCLGET(n, M_DONTWAIT);
 			if (!(n->m_flags & M_EXT)) {
 				m_freem(n);
 				return (ENOBUFS);
 			}
 
 			n->m_len = oldoptlen + JUMBOOPTLEN;
 			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
 			    oldoptlen);
 			optbuf = mtod(n, u_char *) + oldoptlen;
 			m_freem(mopt);
 			exthdrs->ip6e_hbh = n;
 			mopt = n;
 		}
 
 		/* Two bytes of padding go in front of the option. */
 		optbuf[0] = IP6OPT_PADN;
 		optbuf[1] = 1;
 
 		/*
 		 * Account for the padding and the jumbo payload option
 		 * in the header's own length field.
 		 */
 		hbh = mtod(mopt, struct ip6_hbh *);
 		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
 	}
 
 	/*
 	 * Write the option itself: type, data length, then the length
 	 * value (plen plus the bytes just added) in network byte order.
 	 */
 	optbuf[2] = IP6OPT_JUMBO;
 	optbuf[3] = 4;
 	jumbolen = (u_int32_t)htonl(plen + JUMBOOPTLEN);
 	bcopy(&jumbolen, &optbuf[4], sizeof(u_int32_t));
 
 	/* The packet as a whole grew by the same amount. */
 	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
 
 	return (0);
 #undef JUMBOOPTLEN
 }
 
 /*
  * Insert fragment header and copy unfragmentable header portions.
  */
 static int
 ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
     struct ip6_frag **frghdrp)
 {
 	struct mbuf *n, *mlast;
 
 	if (hlen > sizeof(struct ip6_hdr)) {
 		n = m_copym(m0, sizeof(struct ip6_hdr),
 		    hlen - sizeof(struct ip6_hdr), M_DONTWAIT);
 		if (n == 0)
 			return (ENOBUFS);
 		m->m_next = n;
 	} else
 		n = m;
 
 	/* Search for the last mbuf of unfragmentable part. */
 	for (mlast = n; mlast->m_next; mlast = mlast->m_next)
 		;
 
 	if ((mlast->m_flags & M_EXT) == 0 &&
 	    M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
 		/* use the trailing space of the last mbuf for the fragment hdr */
 		*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
 		    mlast->m_len);
 		mlast->m_len += sizeof(struct ip6_frag);
 		m->m_pkthdr.len += sizeof(struct ip6_frag);
 	} else {
 		/* allocate a new mbuf for the fragment header */
 		struct mbuf *mfrg;
 
 		MGET(mfrg, M_DONTWAIT, MT_DATA);
 		if (mfrg == 0)
 			return (ENOBUFS);
 		mfrg->m_len = sizeof(struct ip6_frag);
 		*frghdrp = mtod(mfrg, struct ip6_frag *);
 		mlast->m_next = mfrg;
 	}
 
 	return (0);
 }
 
 /*
  * Work out the MTU to use towards dst and report it through *mtup.
  *
  * ro_pmtu is the route used for the path-MTU lookup and is dereferenced
  * unconditionally; ro is only compared against it.  When the two differ,
  * the route cached in ro_pmtu is released if it is no longer RTF_UP or
  * was looked up for a different address, and a new one is requested
  * with rtalloc() for dst.
  *
  * With a route in hand, the candidate MTU is the smaller of the value
  * returned by tcp_hc_getmtu() and the route's rmx_mtu (rmx_mtu alone
  * when tcp_hc_getmtu() returns 0).  It is then adjusted:
  *   - 0: fall back to the link MTU of the interface;
  *   - below IPV6_MMTU: use IPV6_MMTU and set the alwaysfrag flag;
  *   - above the link MTU: use the link MTU and store it in the route.
  * Without a route the link MTU of ifp is used; if ifp is NULL as well,
  * EHOSTUNREACH is returned and *mtup is set to 0.
  *
  * ifp may be NULL when a route is found (the route's rt_ifp is used
  * instead).  alwaysfragp may be NULL, in which case the flag is not
  * reported.  Returns 0 or EHOSTUNREACH.
  */
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro,
     struct ifnet *ifp, struct in6_addr *dst, u_long *mtup,
     int *alwaysfragp)
 {
 	u_int32_t mtu = 0;
 	int alwaysfrag = 0;
 	int error = 0;
 
 	if (ro_pmtu != ro) {
 		/* The first hop and the final destination may differ. */
 		struct sockaddr_in6 *sa6_dst =
 		    (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
 		/* Drop a cached route that is down or is for another address. */
 		if (ro_pmtu->ro_rt &&
 		    ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 ||
 		     !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) {
 			RTFREE(ro_pmtu->ro_rt);
 			ro_pmtu->ro_rt = (struct rtentry *)NULL;
 		}
 		/* No usable cached route: look one up for dst. */
 		if (ro_pmtu->ro_rt == NULL) {
 			bzero(sa6_dst, sizeof(*sa6_dst));
 			sa6_dst->sin6_family = AF_INET6;
 			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
 			sa6_dst->sin6_addr = *dst;
 
 			rtalloc((struct route *)ro_pmtu);
 		}
 	}
 	if (ro_pmtu->ro_rt) {
 		u_int32_t ifmtu;
 		struct in_conninfo inc;
 
 		bzero(&inc, sizeof(inc));
-		inc.inc_flags = 1; /* IPv6 */
+		inc.inc_flags |= INC_ISIPV6;
 		inc.inc6_faddr = *dst;
 
 		if (ifp == NULL)
 			ifp = ro_pmtu->ro_rt->rt_ifp;
 		ifmtu = IN6_LINKMTU(ifp);
 		/*
 		 * NOTE(review): tcp_hc_getmtu() presumably returns the MTU
 		 * recorded for this destination in the TCP host cache, with
 		 * 0 meaning "nothing recorded" -- confirm.
 		 */
 		mtu = tcp_hc_getmtu(&inc);
 		if (mtu)
 			mtu = min(mtu, ro_pmtu->ro_rt->rt_rmx.rmx_mtu);
 		else
 			mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu;
 		if (mtu == 0)
 			mtu = ifmtu;
 		else if (mtu < IPV6_MMTU) {
 			/*
 			 * RFC2460 section 5, last paragraph:
 			 * if we record ICMPv6 too big message with
 			 * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
 			 * or smaller, with fragment header attached.
 			 * (fragment header is needed regardless from the
 			 * packet size, for translators to identify packets)
 			 */
 			alwaysfrag = 1;
 			mtu = IPV6_MMTU;
 		} else if (mtu > ifmtu) {
 			/*
 			 * The MTU on the route is larger than the MTU on
 			 * the interface!  This shouldn't happen, unless the
 			 * MTU of the interface has been changed after the
 			 * interface was brought up.  Change the MTU in the
 			 * route to match the interface MTU (as long as the
 			 * field isn't locked).
 			 *
 			 * NOTE(review): no lock check is performed in this
 			 * function; the route's rmx_mtu is overwritten
 			 * unconditionally below.
 			 */
 			mtu = ifmtu;
 			ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu;
 		}
 	} else if (ifp) {
 		/* No route at all: the link MTU is the best we know. */
 		mtu = IN6_LINKMTU(ifp);
 	} else
 		error = EHOSTUNREACH; /* XXX */
 
 	/* *mtup is written even on error (it is then 0). */
 	*mtup = mtu;
 	if (alwaysfragp)
 		*alwaysfragp = alwaysfrag;
 	return (error);
 }
 
 /*
  * IP6 socket option processing.
  */
 int
 ip6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int optdatalen, uproto;
 	void *optdata;
 	struct inpcb *in6p = sotoinpcb(so);
 	int error, optval;
 	int level, op, optname;
 	int optlen;
 	struct thread *td;
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 	td = sopt->sopt_td;
 	error = 0;
 	optval = 0;
 	uproto = (int)so->so_proto->pr_protocol;
 
 	if (level == IPPROTO_IPV6) {
 		switch (op) {
 
 		case SOPT_SET:
 			switch (optname) {
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 			{
 				struct mbuf *m;
 
 				error = soopt_getm(sopt, &m); /* XXX */
 				if (error != 0)
 					break;
 				error = soopt_mcopyin(sopt, m); /* XXX */
 				if (error != 0)
 					break;
 				error = ip6_pcbopts(&in6p->in6p_outputopts,
 						    m, so, sopt);
 				m_freem(m); /* XXX */
 				break;
 			}
 
 			/*
 			 * Use of some Hop-by-Hop options or some
 			 * Destination options, might require special
 			 * privilege.  That is, normal applications
 			 * (without special privilege) might be forbidden
 			 * from setting certain options in outgoing packets,
 			 * and might never see certain options in received
 			 * packets. [RFC 2292 Section 6]
 			 * KAME specific note:
 			 *  KAME prevents non-privileged users from sending or
 			 *  receiving ANY hbh/dst options in order to avoid
 			 *  overhead of parsing options in the kernel.
 			 */
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 				if (td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_SETHDROPTS);
 					if (error)
 						break;
 				}
 				/* FALLTHROUGH */
 			case IPV6_UNICAST_HOPS:
 			case IPV6_HOPLIMIT:
 			case IPV6_FAITH:
 
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 			case IPV6_RECVTCLASS:
 			case IPV6_V6ONLY:
 			case IPV6_AUTOFLOWLABEL:
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 
 				case IPV6_UNICAST_HOPS:
 					if (optval < -1 || optval >= 256)
 						error = EINVAL;
 					else {
 						/* -1 = kernel default */
 						in6p->in6p_hops = optval;
 						if ((in6p->inp_vflag &
 						     INP_IPV4) != 0)
 							in6p->inp_ip_ttl = optval;
 					}
 					break;
 #define OPTSET(bit) \
 do { \
 	if (optval) \
 		in6p->inp_flags |= (bit); \
 	else \
 		in6p->inp_flags &= ~(bit); \
 } while (/*CONSTCOND*/ 0)
 #define OPTSET2292(bit) \
 do { \
 	in6p->inp_flags |= IN6P_RFC2292; \
 	if (optval) \
 		in6p->inp_flags |= (bit); \
 	else \
 		in6p->inp_flags &= ~(bit); \
 } while (/*CONSTCOND*/ 0)
 #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0)
 
 				case IPV6_RECVPKTINFO:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_PKTINFO);
 					break;
 
 				case IPV6_HOPLIMIT:
 				{
 					struct ip6_pktopts **optp;
 
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					optp = &in6p->in6p_outputopts;
 					error = ip6_pcbopt(IPV6_HOPLIMIT,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					break;
 				}
 
 				case IPV6_RECVHOPLIMIT:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVHOPOPTS:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDR:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_RTHDR);
 					break;
 
 				case IPV6_FAITH:
 					OPTSET(IN6P_FAITH);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					/*
 					 * We ignore this option for TCP
 					 * sockets.
 					 * (RFC3542 leaves this case
 					 * unspecified.)
 					 */
 					if (uproto != IPPROTO_TCP)
 						OPTSET(IN6P_MTU);
 					break;
 
 				case IPV6_V6ONLY:
 					/*
 					 * make setsockopt(IPV6_V6ONLY)
 					 * available only prior to bind(2).
 					 * see ipng mailing list, Jun 22 2001.
 					 */
 					if (in6p->inp_lport ||
 					    !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_IPV6_V6ONLY);
 					if (optval)
 						in6p->inp_vflag &= ~INP_IPV4;
 					else
 						in6p->inp_vflag |= INP_IPV4;
 					break;
 				case IPV6_RECVTCLASS:
 					/* cannot mix with RFC2292 XXX */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_TCLASS);
 					break;
 				case IPV6_AUTOFLOWLABEL:
 					OPTSET(IN6P_AUTOFLOWLABEL);
 					break;
 
 				}
 				break;
 
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				if (optlen != sizeof(optval)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				{
 					struct ip6_pktopts **optp;
 					optp = &in6p->in6p_outputopts;
 					error = ip6_pcbopt(optname,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					break;
 				}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292DSTOPTS:
 			case IPV6_2292RTHDR:
 				/* RFC 2292 */
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					OPTSET2292(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					OPTSET2292(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					/*
 					 * Check super-user privilege.
 					 * See comments for IPV6_RECVHOPOPTS.
 					 */
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292DSTOPTS:
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
 					break;
 				case IPV6_2292RTHDR:
 					OPTSET2292(IN6P_RTHDR);
 					break;
 				}
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			{
 				/* new advanced API (RFC3542) */
 				u_char *optbuf;
 				u_char optbuf_storage[MCLBYTES];
 				int optlen;
 				struct ip6_pktopts **optp;
 
 				/* cannot mix with RFC2292 */
 				if (OPTBIT(IN6P_RFC2292)) {
 					error = EINVAL;
 					break;
 				}
 
 				/*
 				 * We only ensure valsize is not too large
 				 * here.  Further validation will be done
 				 * later.
 				 */
 				error = sooptcopyin(sopt, optbuf_storage,
 				    sizeof(optbuf_storage), 0);
 				if (error)
 					break;
 				optlen = sopt->sopt_valsize;
 				optbuf = optbuf_storage;
 				optp = &in6p->in6p_outputopts;
 				error = ip6_pcbopt(optname, optbuf, optlen,
 				    optp, (td != NULL) ? td->td_ucred : NULL,
 				    uproto);
 				break;
 			}
 #undef OPTSET
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_JOIN_GROUP:
 			case IPV6_LEAVE_GROUP:
 			    {
 				if (sopt->sopt_valsize > MLEN) {
 					error = EMSGSIZE;
 					break;
 				}
 				/* XXX */
 			    }
 			    /* FALLTHROUGH */
 			    {
 				struct mbuf *m;
 
 				if (sopt->sopt_valsize > MCLBYTES) {
 					error = EMSGSIZE;
 					break;
 				}
 				/* XXX */
 				MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
 				if (m == 0) {
 					error = ENOBUFS;
 					break;
 				}
 				if (sopt->sopt_valsize > MLEN) {
 					MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
 					if ((m->m_flags & M_EXT) == 0) {
 						m_free(m);
 						error = ENOBUFS;
 						break;
 					}
 				}
 				m->m_len = sopt->sopt_valsize;
 				error = sooptcopyin(sopt, mtod(m, char *),
 						    m->m_len, m->m_len);
 				if (error) {
 					(void)m_free(m);
 					break;
 				}
 				error =	ip6_setmoptions(sopt->sopt_name,
 							&in6p->in6p_moptions,
 							m);
 				(void)m_free(m);
 			    }
 				break;
 
 			case IPV6_PORTRANGE:
 				error = sooptcopyin(sopt, &optval,
 				    sizeof optval, sizeof optval);
 				if (error)
 					break;
 
 				switch (optval) {
 				case IPV6_PORTRANGE_DEFAULT:
 					in6p->inp_flags &= ~(IN6P_LOWPORT);
 					in6p->inp_flags &= ~(IN6P_HIGHPORT);
 					break;
 
 				case IPV6_PORTRANGE_HIGH:
 					in6p->inp_flags &= ~(IN6P_LOWPORT);
 					in6p->inp_flags |= IN6P_HIGHPORT;
 					break;
 
 				case IPV6_PORTRANGE_LOW:
 					in6p->inp_flags &= ~(IN6P_HIGHPORT);
 					in6p->inp_flags |= IN6P_LOWPORT;
 					break;
 
 				default:
 					error = EINVAL;
 					break;
 				}
 				break;
 
 #ifdef IPSEC
 			case IPV6_IPSEC_POLICY:
 			{
 				caddr_t req;
 				struct mbuf *m;
 
 				if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
 					break;
 				if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
 					break;
 				req = mtod(m, caddr_t);
 				error = ipsec6_set_policy(in6p, optname, req,
 				    m->m_len, (sopt->sopt_td != NULL) ?
 				    sopt->sopt_td->td_ucred : NULL);
 				m_freem(m);
 				break;
 			}
 #endif /* IPSEC */
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 
 		case SOPT_GET:
 			switch (optname) {
 
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 				/*
 				 * RFC3542 (effectively) deprecated the
 				 * semantics of the 2292-style pktoptions.
 				 * Since it was not reliable in nature (i.e.,
 				 * applications had to expect the lack of some
 				 * information after all), it would make sense
 				 * to simplify this part by always returning
 				 * empty data.
 				 */
 				sopt->sopt_valsize = 0;
 				break;
 
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 			case IPV6_UNICAST_HOPS:
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 
 			case IPV6_FAITH:
 			case IPV6_V6ONLY:
 			case IPV6_PORTRANGE:
 			case IPV6_RECVTCLASS:
 			case IPV6_AUTOFLOWLABEL:
 				switch (optname) {
 
 				case IPV6_RECVHOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_UNICAST_HOPS:
 					optval = in6p->in6p_hops;
 					break;
 
 				case IPV6_RECVPKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 
 				case IPV6_RECVHOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVRTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					optval = OPTBIT(IN6P_MTU);
 					break;
 
 				case IPV6_FAITH:
 					optval = OPTBIT(IN6P_FAITH);
 					break;
 
 				case IPV6_V6ONLY:
 					optval = OPTBIT(IN6P_IPV6_V6ONLY);
 					break;
 
 				case IPV6_PORTRANGE:
 				    {
 					int flags;
 					flags = in6p->inp_flags;
 					if (flags & IN6P_HIGHPORT)
 						optval = IPV6_PORTRANGE_HIGH;
 					else if (flags & IN6P_LOWPORT)
 						optval = IPV6_PORTRANGE_LOW;
 					else
 						optval = 0;
 					break;
 				    }
 				case IPV6_RECVTCLASS:
 					optval = OPTBIT(IN6P_TCLASS);
 					break;
 
 				case IPV6_AUTOFLOWLABEL:
 					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
 					break;
 				}
 				if (error)
 					break;
 				error = sooptcopyout(sopt, &optval,
 					sizeof optval);
 				break;
 
 			case IPV6_PATHMTU:
 			{
 				u_long pmtu = 0;
 				struct ip6_mtuinfo mtuinfo;
 				struct route_in6 sro;
 
 				bzero(&sro, sizeof(sro));
 
 				if (!(so->so_state & SS_ISCONNECTED))
 					return (ENOTCONN);
 				/*
 				 * XXX: we do not consider the case of source
 				 * routing, or optional information to specify
 				 * the outgoing interface.
 				 */
 				error = ip6_getpmtu(&sro, NULL, NULL,
 				    &in6p->in6p_faddr, &pmtu, NULL);
 				if (sro.ro_rt)
 					RTFREE(sro.ro_rt);
 				if (error)
 					break;
 				if (pmtu > IPV6_MAXPACKET)
 					pmtu = IPV6_MAXPACKET;
 
 				bzero(&mtuinfo, sizeof(mtuinfo));
 				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
 				optdata = (void *)&mtuinfo;
 				optdatalen = sizeof(mtuinfo);
 				error = sooptcopyout(sopt, optdata,
 				    optdatalen);
 				break;
 			}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292RTHDR:
 			case IPV6_2292DSTOPTS:
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292RTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 				case IPV6_2292DSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
 					break;
 				}
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				error = ip6_getpcbopt(in6p->in6p_outputopts,
 				    optname, sopt);
 				break;
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_JOIN_GROUP:
 			case IPV6_LEAVE_GROUP:
 			    {
 				struct mbuf *m;
 				error = ip6_getmoptions(sopt->sopt_name,
 				    in6p->in6p_moptions, &m);
 				if (error == 0)
 					error = sooptcopyout(sopt,
 					    mtod(m, char *), m->m_len);
 				m_freem(m);
 			    }
 				break;
 
 #ifdef IPSEC
 			case IPV6_IPSEC_POLICY:
 			  {
 				caddr_t req = NULL;
 				size_t len = 0;
 				struct mbuf *m = NULL;
 				struct mbuf **mp = &m;
 				size_t ovalsize = sopt->sopt_valsize;
 				caddr_t oval = (caddr_t)sopt->sopt_val;
 
 				error = soopt_getm(sopt, &m); /* XXX */
 				if (error != 0)
 					break;
 				error = soopt_mcopyin(sopt, m); /* XXX */
 				if (error != 0)
 					break;
 				sopt->sopt_valsize = ovalsize;
 				sopt->sopt_val = oval;
 				if (m) {
 					req = mtod(m, caddr_t);
 					len = m->m_len;
 				}
 				error = ipsec6_get_policy(in6p, req, len, mp);
 				if (error == 0)
 					error = soopt_mcopyout(sopt, m); /* XXX */
 				if (error == 0 && m)
 					m_freem(m);
 				break;
 			  }
 #endif /* IPSEC */
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 		}
 	} else {		/* level != IPPROTO_IPV6 */
 		error = EINVAL;
 	}
 	return (error);
 }
 
 /*
  * Socket option handler for raw IPv6 sockets.  Only IPV6_CHECKSUM at
  * level IPPROTO_IPV6 is handled here: any other level yields EINVAL and
  * any other option name yields ENOPROTOOPT.
  */
 int
 ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
 	struct inpcb *in6p = sotoinpcb(so);
 	int optlen = sopt->sopt_valsize;
 	int optval, error;
 
 	if (sopt->sopt_level != IPPROTO_IPV6)
 		return (EINVAL);
 	if (sopt->sopt_name != IPV6_CHECKSUM)
 		return (ENOPROTOOPT);
 
 	/*
 	 * For ICMPv6 sockets, no modification allowed for checksum
 	 * offset, permit "no change" values to help existing apps.
 	 *
 	 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
 	 * for an ICMPv6 socket will fail."
 	 * The current behavior does not meet RFC3542.
 	 */
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		if (optlen != sizeof(int))
 			return (EINVAL);
 		error = sooptcopyin(sopt, &optval, sizeof(optval),
 		    sizeof(optval));
 		if (error)
 			return (error);
 		/* the API assumes even offset values */
 		if ((optval % 2) != 0)
 			return (EINVAL);
 		if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
 			/* only the fixed ICMPv6 checksum offset is accepted */
 			return ((optval != icmp6off) ? EINVAL : 0);
 		}
 		in6p->in6p_cksum = optval;
 		return (0);
 
 	case SOPT_GET:
 		if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
 			optval = icmp6off;
 		else
 			optval = in6p->in6p_cksum;
 		return (sooptcopyout(sopt, &optval, sizeof(optval)));
 
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * Set up IP6 options in pcb for insertion in output packets or
  * specifying behavior of outgoing packets.
  */
 static int
 ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
     struct socket *so, struct sockopt *sopt)
 {
 	struct ip6_pktopts *opt = *pktopt;
 	int error = 0;
 	struct thread *td = sopt->sopt_td;
 
 	/* turn off any old options. */
 	if (opt) {
 #ifdef DIAGNOSTIC
 		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
 		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
 		    opt->ip6po_rhinfo.ip6po_rhi_rthdr)
 			printf("ip6_pcbopts: all specified options are cleared.\n");
 #endif
 		ip6_clearpktopts(opt, -1);
 	} else
 		opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK);
 	*pktopt = NULL;
 
 	if (!m || m->m_len == 0) {
 		/*
 		 * Only turning off any previous options, regardless of
 		 * whether the opt is just created or given.
 		 */
 		free(opt, M_IP6OPT);
 		return (0);
 	}
 
 	/*  set options specified by user. */
 	if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
 	    td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
 		ip6_clearpktopts(opt, -1); /* XXX: discard all options */
 		free(opt, M_IP6OPT);
 		return (error);
 	}
 	*pktopt = opt;
 	return (0);
 }
 
 /*
  * Reset an ip6_pktopts to its initial state.  Zeroing alone is not
  * enough: several members have non-zero "unset" values, which are
  * assigned after the structure is cleared.
  */
 void
 ip6_initpktopts(struct ip6_pktopts *opt)
 {
 
 	bzero(opt, sizeof(*opt));
 
 	/* -1 selects the default hop limit / traffic class */
 	opt->ip6po_tclass = -1;
 	opt->ip6po_hlim = -1;
 
 	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
 	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
 }
 
 /*
  * Set a single sticky option on the pcb's option set, creating and
  * initializing that set first when the pcb has none yet.
  */
 static int
 ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
     struct ucred *cred, int uproto)
 {
 	struct ip6_pktopts *opt = *pktopt;
 
 	if (opt == NULL) {
 		opt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_WAITOK);
 		*pktopt = opt;
 		ip6_initpktopts(opt);
 	}
 
 	/* sticky = 1, cmsg = 0 */
 	return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
 }
 
 static int
 ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
 {
 	void *optdata = NULL;
 	int optdatalen = 0;
 	struct ip6_ext *ip6e;
 	int error = 0;
 	struct in6_pktinfo null_pktinfo;
 	int deftclass = 0, on;
 	int defminmtu = IP6PO_MINMTU_MCASTONLY;
 	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
 
 	switch (optname) {
 	case IPV6_PKTINFO:
 		if (pktopt && pktopt->ip6po_pktinfo)
 			optdata = (void *)pktopt->ip6po_pktinfo;
 		else {
 			/* XXX: we don't have to do this every time... */
 			bzero(&null_pktinfo, sizeof(null_pktinfo));
 			optdata = (void *)&null_pktinfo;
 		}
 		optdatalen = sizeof(struct in6_pktinfo);
 		break;
 	case IPV6_TCLASS:
 		if (pktopt && pktopt->ip6po_tclass >= 0)
 			optdata = (void *)&pktopt->ip6po_tclass;
 		else
 			optdata = (void *)&deftclass;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_HOPOPTS:
 		if (pktopt && pktopt->ip6po_hbh) {
 			optdata = (void *)pktopt->ip6po_hbh;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_RTHDR:
 		if (pktopt && pktopt->ip6po_rthdr) {
 			optdata = (void *)pktopt->ip6po_rthdr;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_RTHDRDSTOPTS:
 		if (pktopt && pktopt->ip6po_dest1) {
 			optdata = (void *)pktopt->ip6po_dest1;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_DSTOPTS:
 		if (pktopt && pktopt->ip6po_dest2) {
 			optdata = (void *)pktopt->ip6po_dest2;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_NEXTHOP:
 		if (pktopt && pktopt->ip6po_nexthop) {
 			optdata = (void *)pktopt->ip6po_nexthop;
 			optdatalen = pktopt->ip6po_nexthop->sa_len;
 		}
 		break;
 	case IPV6_USE_MIN_MTU:
 		if (pktopt)
 			optdata = (void *)&pktopt->ip6po_minmtu;
 		else
 			optdata = (void *)&defminmtu;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_DONTFRAG:
 		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
 			on = 1;
 		else
 			on = 0;
 		optdata = (void *)&on;
 		optdatalen = sizeof(on);
 		break;
 	case IPV6_PREFER_TEMPADDR:
 		if (pktopt)
 			optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
 		else
 			optdata = (void *)&defpreftemp;
 		optdatalen = sizeof(int);
 		break;
 	default:		/* should not happen */
 #ifdef DIAGNOSTIC
 		panic("ip6_getpcbopt: unexpected option\n");
 #endif
 		return (ENOPROTOOPT);
 	}
 
 	error = sooptcopyout(sopt, optdata, optdatalen);
 
 	return (error);
 }
 
 /*
  * Release the storage held for one option (selected by optname) or for
  * every option (optname == -1) and put the affected members back to
  * their "unset" values.  The ip6_pktopts structure itself is not freed.
  */
 void
 ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
 {
 	int all;
 
 	if (pktopt == NULL)
 		return;
 
 	all = (optname == -1);
 
 	if (all || optname == IPV6_PKTINFO) {
 		if (pktopt->ip6po_pktinfo != NULL)
 			free(pktopt->ip6po_pktinfo, M_IP6OPT);
 		pktopt->ip6po_pktinfo = NULL;
 	}
 
 	if (all || optname == IPV6_HOPLIMIT) {
 		pktopt->ip6po_hlim = -1;
 	}
 
 	if (all || optname == IPV6_TCLASS) {
 		pktopt->ip6po_tclass = -1;
 	}
 
 	if (all || optname == IPV6_NEXTHOP) {
 		/* drop the route cached for the next hop as well */
 		if (pktopt->ip6po_nextroute.ro_rt != NULL) {
 			RTFREE(pktopt->ip6po_nextroute.ro_rt);
 			pktopt->ip6po_nextroute.ro_rt = NULL;
 		}
 		if (pktopt->ip6po_nexthop != NULL)
 			free(pktopt->ip6po_nexthop, M_IP6OPT);
 		pktopt->ip6po_nexthop = NULL;
 	}
 
 	if (all || optname == IPV6_HOPOPTS) {
 		if (pktopt->ip6po_hbh != NULL)
 			free(pktopt->ip6po_hbh, M_IP6OPT);
 		pktopt->ip6po_hbh = NULL;
 	}
 
 	if (all || optname == IPV6_RTHDRDSTOPTS) {
 		if (pktopt->ip6po_dest1 != NULL)
 			free(pktopt->ip6po_dest1, M_IP6OPT);
 		pktopt->ip6po_dest1 = NULL;
 	}
 
 	if (all || optname == IPV6_RTHDR) {
 		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr != NULL)
 			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
 		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
 		/* drop the route cached for the routing header as well */
 		if (pktopt->ip6po_route.ro_rt != NULL) {
 			RTFREE(pktopt->ip6po_route.ro_rt);
 			pktopt->ip6po_route.ro_rt = NULL;
 		}
 	}
 
 	if (all || optname == IPV6_DSTOPTS) {
 		if (pktopt->ip6po_dest2 != NULL)
 			free(pktopt->ip6po_dest2, M_IP6OPT);
 		pktopt->ip6po_dest2 = NULL;
 	}
 }
 
 /*
  * Duplicate the extension header src->type into freshly allocated
  * storage at dst->type.  The byte length is taken from the header's
  * own length field, (ip6e_len + 1) * 8.  Expects "src", "dst" and
  * "canwait" in scope and a "bad" label to jump to when the allocation
  * fails; a NULL allocation is never passed on to bcopy().
  */
 #define PKTOPT_EXTHDRCPY(type) \
 do {\
 	if (src->type) {\
 		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
 		dst->type = malloc(hlen, M_IP6OPT, canwait);\
 		if (dst->type == NULL)\
 			goto bad;\
 		bcopy(src->type, dst->type, hlen);\
 	}\
 } while (/*CONSTCOND*/ 0)
 
 /*
  * Deep-copy the packet options in "src" into "dst".
  *
  * "dst" must already be initialized (its pointer members are
  * overwritten without being freed, and on failure everything attached
  * to it is released through ip6_clearpktopts()).  "canwait" is passed
  * to malloc() as the wait flag.
  *
  * The scalar settings (hop limit, traffic class, flags, minimum-MTU
  * policy and temporary-address preference) are copied by value; the
  * pktinfo, next hop and extension headers are duplicated.  Cached
  * routes are not copied.
  *
  * Returns 0 on success, EINVAL when either argument is NULL, or
  * ENOBUFS when an allocation fails.
  */
 static int
 copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
 {
 	if (dst == NULL || src == NULL)  {
 		printf("copypktopts: invalid argument\n");
 		return (EINVAL);
 	}
 
 	dst->ip6po_hlim = src->ip6po_hlim;
 	dst->ip6po_tclass = src->ip6po_tclass;
 	dst->ip6po_flags = src->ip6po_flags;
 	/*
 	 * These two are plain values with non-zero defaults; without the
 	 * copy the destination would silently fall back to the defaults.
 	 */
 	dst->ip6po_minmtu = src->ip6po_minmtu;
 	dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
 	if (src->ip6po_pktinfo) {
 		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_pktinfo == NULL)
 			goto bad;
 		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
 	}
 	if (src->ip6po_nexthop) {
 		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_nexthop == NULL)
 			goto bad;
 		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
 		    src->ip6po_nexthop->sa_len);
 	}
 	PKTOPT_EXTHDRCPY(ip6po_hbh);
 	PKTOPT_EXTHDRCPY(ip6po_dest1);
 	PKTOPT_EXTHDRCPY(ip6po_dest2);
 	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
 	return (0);
 
   bad:
 	/* release whatever was duplicated before the failure */
 	ip6_clearpktopts(dst, -1);
 	return (ENOBUFS);
 }
 #undef PKTOPT_EXTHDRCPY
 
 /*
  * Allocate a new ip6_pktopts and fill it with a copy of "src".
  * "canwait" is handed to malloc() as the wait flag.  Returns the new
  * structure, or NULL when the allocation or the copy fails.
  */
 struct ip6_pktopts *
 ip6_copypktopts(struct ip6_pktopts *src, int canwait)
 {
 	struct ip6_pktopts *copy;
 
 	copy = malloc(sizeof(*copy), M_IP6OPT, canwait);
 	if (copy != NULL) {
 		ip6_initpktopts(copy);
 		if (copypktopts(copy, src, canwait) != 0) {
 			free(copy, M_IP6OPT);
 			copy = NULL;
 		}
 	}
 
 	return (copy);
 }
 
 /*
  * Dispose of a pcb's option set: release every attached option, then
  * the structure itself.  A NULL argument is accepted and ignored.
  */
 void
 ip6_freepcbopts(struct ip6_pktopts *pktopt)
 {
 	if (pktopt != NULL) {
 		ip6_clearpktopts(pktopt, -1);
 		free(pktopt, M_IP6OPT);
 	}
 }
 
 /*
  * Set the IP6 multicast options in response to user setsockopt().
  *
  * optname: IPV6_MULTICAST_IF, IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP,
  *	    IPV6_JOIN_GROUP or IPV6_LEAVE_GROUP; anything else yields
  *	    EOPNOTSUPP.
  * im6op:   in/out pointer to the multicast options.  When *im6op is
  *	    NULL a structure with default values is allocated and stored
  *	    there.  On exit, if every option holds its default value and
  *	    the membership list is empty, the structure is freed and
  *	    *im6op is set to NULL (this also happens on error paths).
  * m:	    mbuf holding the option value; its length must match the
  *	    size expected for the option exactly.
  *
  * Returns 0 on success or an errno value: EINVAL, ENXIO,
  * EADDRNOTAVAIL, EADDRINUSE, EOPNOTSUPP, ENOBUFS, or whatever
  * priv_check(), in6_joingroup() or sa6_embedscope() report.
  */
 static int
 ip6_setmoptions(int optname, struct ip6_moptions **im6op, struct mbuf *m)
 {
 	INIT_VNET_NET(curvnet);
 	INIT_VNET_INET6(curvnet);
 	int error = 0;
 	u_int loop, ifindex;
 	struct ipv6_mreq *mreq;
 	struct ifnet *ifp;
 	struct ip6_moptions *im6o = *im6op;
 	struct route_in6 ro;
 	struct in6_multi_mship *imm;
 
 	if (im6o == NULL) {
 		/*
 		 * No multicast option buffer attached to the pcb;
 		 * allocate one and initialize to default values.
 		 */
 		im6o = (struct ip6_moptions *)
 			malloc(sizeof(*im6o), M_IP6MOPTS, M_WAITOK);
 
 		if (im6o == NULL)
 			return (ENOBUFS);
 		*im6op = im6o;
 		im6o->im6o_multicast_ifp = NULL;
 		im6o->im6o_multicast_hlim = V_ip6_defmcasthlim;
 		im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
 		LIST_INIT(&im6o->im6o_memberships);
 	}
 
 	switch (optname) {
 
 	case IPV6_MULTICAST_IF:
 		/*
 		 * Select the interface for outgoing multicast packets.
 		 */
 		if (m == NULL || m->m_len != sizeof(u_int)) {
 			error = EINVAL;
 			break;
 		}
 		bcopy(mtod(m, u_int *), &ifindex, sizeof(ifindex));
 		/*
 		 * ifindex is a u_int, so the "< 0" half of this test can
 		 * never be true; only the upper bound is effective.
 		 */
 		if (ifindex < 0 || V_if_index < ifindex) {
 			error = ENXIO;	/* XXX EINVAL? */
 			break;
 		}
 		ifp = ifnet_byindex(ifindex);
 		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		im6o->im6o_multicast_ifp = ifp;
 		break;
 
 	case IPV6_MULTICAST_HOPS:
 	    {
 		/*
 		 * Set the IP6 hoplimit for outgoing multicast packets.
 		 * Accepted values are 0..255, or -1 to select the
 		 * system default.
 		 */
 		int optval;
 		if (m == NULL || m->m_len != sizeof(int)) {
 			error = EINVAL;
 			break;
 		}
 		bcopy(mtod(m, u_int *), &optval, sizeof(optval));
 		if (optval < -1 || optval >= 256)
 			error = EINVAL;
 		else if (optval == -1)
 			im6o->im6o_multicast_hlim = V_ip6_defmcasthlim;
 		else
 			im6o->im6o_multicast_hlim = optval;
 		break;
 	    }
 
 	case IPV6_MULTICAST_LOOP:
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.
 		 */
 		if (m == NULL || m->m_len != sizeof(u_int)) {
 			error = EINVAL;
 			break;
 		}
 		bcopy(mtod(m, u_int *), &loop, sizeof(loop));
 		if (loop > 1) {
 			error = EINVAL;
 			break;
 		}
 		im6o->im6o_multicast_loop = loop;
 		break;
 
 	case IPV6_JOIN_GROUP:
 		/*
 		 * Add a multicast group membership.
 		 * Group must be a valid IP6 multicast address.
 		 */
 		if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) {
 			error = EINVAL;
 			break;
 		}
 		mreq = mtod(m, struct ipv6_mreq *);
 
 		if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) {
 			/*
 			 * We use the unspecified address to specify to accept
 			 * all multicast addresses. Only super user is allowed
 			 * to do this.
 			 */
 			/* XXX-BZ might need a better PRIV_NETINET_x for this */
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error)
 				break;
 		} else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) {
 			error = EINVAL;
 			break;
 		}
 
 		/*
 		 * If no interface was explicitly specified, choose an
 		 * appropriate one according to the given multicast address.
 		 */
 		if (mreq->ipv6mr_interface == 0) {
 			struct sockaddr_in6 *dst;
 
 			/*
 			 * Look up the routing table for the
 			 * address, and choose the outgoing interface.
 			 *   XXX: is it a good approach?
 			 */
 			ro.ro_rt = NULL;
 			dst = (struct sockaddr_in6 *)&ro.ro_dst;
 			bzero(dst, sizeof(*dst));
 			dst->sin6_family = AF_INET6;
 			dst->sin6_len = sizeof(*dst);
 			dst->sin6_addr = mreq->ipv6mr_multiaddr;
 			rtalloc((struct route *)&ro);
 			if (ro.ro_rt == NULL) {
 				error = EADDRNOTAVAIL;
 				break;
 			}
 			/* keep the interface, release the route reference */
 			ifp = ro.ro_rt->rt_ifp;
 			RTFREE(ro.ro_rt);
 		} else {
 			/*
 			 * If the interface is specified, validate it.
 			 * NOTE(review): if ipv6mr_interface is an unsigned
 			 * type the "< 0" test is always false -- confirm
 			 * against the struct ipv6_mreq definition.
 			 */
 			if (mreq->ipv6mr_interface < 0 ||
 			    V_if_index < mreq->ipv6mr_interface) {
 				error = ENXIO;	/* XXX EINVAL? */
 				break;
 			}
 			ifp = ifnet_byindex(mreq->ipv6mr_interface);
 			if (!ifp) {
 				error = ENXIO;	/* XXX EINVAL? */
 				break;
 			}
 		}
 
 		/*
 		 * See if we found an interface, and confirm that it
 		 * supports multicast
 		 */
 		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 
 		/* The request's address is updated in place by in6_setscope(). */
 		if (in6_setscope(&mreq->ipv6mr_multiaddr, ifp, NULL)) {
 			error = EADDRNOTAVAIL; /* XXX: should not happen */
 			break;
 		}
 
 		/*
 		 * See if the membership already exists.
 		 */
 		for (imm = im6o->im6o_memberships.lh_first;
 		     imm != NULL; imm = imm->i6mm_chain.le_next)
 			if (imm->i6mm_maddr->in6m_ifp == ifp &&
 			    IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
 					       &mreq->ipv6mr_multiaddr))
 				break;
 		if (imm != NULL) {
 			error = EADDRINUSE;
 			break;
 		}
 		/*
 		 * Everything looks good; add a new record to the multicast
 		 * address list for the given interface.
 		 * On failure in6_joingroup() reports the reason through
 		 * "error", which is returned to the caller.
 		 */
 		imm = in6_joingroup(ifp, &mreq->ipv6mr_multiaddr,  &error, 0);
 		if (imm == NULL)
 			break;
 		LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
 		break;
 
 	case IPV6_LEAVE_GROUP:
 		/*
 		 * Drop a multicast group membership.
 		 * Group must be a valid IP6 multicast address.
 		 */
 		if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) {
 			error = EINVAL;
 			break;
 		}
 		mreq = mtod(m, struct ipv6_mreq *);
 
 		/*
 		 * If an interface address was specified, get a pointer
 		 * to its ifnet structure.
 		 */
 		if (mreq->ipv6mr_interface < 0 ||
 		    V_if_index < mreq->ipv6mr_interface) {
 			error = ENXIO;	/* XXX EINVAL? */
 			break;
 		}
 		if (mreq->ipv6mr_interface == 0)
 			ifp = NULL;
 		else
 			ifp = ifnet_byindex(mreq->ipv6mr_interface);
 
 		/* Fill in the scope zone ID */
 		if (ifp) {
 			if (in6_setscope(&mreq->ipv6mr_multiaddr, ifp, NULL)) {
 				/* XXX: should not happen */
 				error = EADDRNOTAVAIL;
 				break;
 			}
 		} else if (mreq->ipv6mr_interface != 0) {
 			/*
 			 * This case happens when the (positive) index is in
 			 * the valid range, but the corresponding interface has
 			 * been detached dynamically (XXX).
 			 */
 			error = EADDRNOTAVAIL;
 			break;
 		} else {	/* ipv6mr_interface == 0 */
 			struct sockaddr_in6 sa6_mc;
 
 			/*
 			 * The API spec says as follows:
 			 *  If the interface index is specified as 0, the
 			 *  system may choose a multicast group membership to
 			 *  drop by matching the multicast address only.
 			 * On the other hand, we cannot disambiguate the scope
 			 * zone unless an interface is provided.  Thus, we
 			 * check if there's ambiguity with the default scope
 			 * zone as the last resort.
 			 */
 			bzero(&sa6_mc, sizeof(sa6_mc));
 			sa6_mc.sin6_family = AF_INET6;
 			sa6_mc.sin6_len = sizeof(sa6_mc);
 			sa6_mc.sin6_addr = mreq->ipv6mr_multiaddr;
 			error = sa6_embedscope(&sa6_mc, V_ip6_use_defzone);
 			if (error != 0)
 				break;
 			mreq->ipv6mr_multiaddr = sa6_mc.sin6_addr;
 		}
 
 		/*
 		 * Find the membership in the membership list.
 		 * With no interface given (ifp == NULL) the first entry
 		 * whose address matches is taken.
 		 */
 		for (imm = im6o->im6o_memberships.lh_first;
 		     imm != NULL; imm = imm->i6mm_chain.le_next) {
 			if ((ifp == NULL || imm->i6mm_maddr->in6m_ifp == ifp) &&
 			    IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
 			    &mreq->ipv6mr_multiaddr))
 				break;
 		}
 		if (imm == NULL) {
 			/* Unable to resolve interface */
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		/*
 		 * Give up the multicast address record to which the
 		 * membership points.
 		 */
 		LIST_REMOVE(imm, i6mm_chain);
 		in6_delmulti(imm->i6mm_maddr);
 		free(imm, M_IP6MADDR);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	/*
 	 * If all options have default values, no need to keep the
 	 * options structure: release it and clear the caller's pointer.
 	 * This runs on error paths too, so a structure allocated above
 	 * for a request that then failed is not left behind.
 	 */
 	if (im6o->im6o_multicast_ifp == NULL &&
 	    im6o->im6o_multicast_hlim == V_ip6_defmcasthlim &&
 	    im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP &&
 	    im6o->im6o_memberships.lh_first == NULL) {
 		free(*im6op, M_IP6MOPTS);
 		*im6op = NULL;
 	}
 
 	return (error);
 }
 
 /*
  * Return the IP6 multicast options in response to user getsockopt().
  */
 static int
 ip6_getmoptions(int optname, struct ip6_moptions *im6o, struct mbuf **mp)
 {
 	INIT_VNET_INET6(curvnet);
 	u_int *hlim, *loop, *ifindex;
 
 	*mp = m_get(M_WAIT, MT_HEADER);		/* XXX */
 
 	switch (optname) {
 
 	case IPV6_MULTICAST_IF:
 		ifindex = mtod(*mp, u_int *);
 		(*mp)->m_len = sizeof(u_int);
 		if (im6o == NULL || im6o->im6o_multicast_ifp == NULL)
 			*ifindex = 0;
 		else
 			*ifindex = im6o->im6o_multicast_ifp->if_index;
 		return (0);
 
 	case IPV6_MULTICAST_HOPS:
 		hlim = mtod(*mp, u_int *);
 		(*mp)->m_len = sizeof(u_int);
 		if (im6o == NULL)
 			*hlim = V_ip6_defmcasthlim;
 		else
 			*hlim = im6o->im6o_multicast_hlim;
 		return (0);
 
 	case IPV6_MULTICAST_LOOP:
 		loop = mtod(*mp, u_int *);
 		(*mp)->m_len = sizeof(u_int);
 		if (im6o == NULL)
 			*loop = V_ip6_defmcasthlim;
 		else
 			*loop = im6o->im6o_multicast_loop;
 		return (0);
 
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * Tear down a set of IP6 multicast options: every membership on the
  * list is unlinked, its multicast address record (if any) is given up
  * and the entry freed; finally the options structure is freed.
  * A NULL argument is accepted and ignored.
  */
 void
 ip6_freemoptions(struct ip6_moptions *im6o)
 {
 	struct in6_multi_mship *mship;
 
 	if (im6o == NULL)
 		return;
 
 	/* always take the current head until the list is drained */
 	for (;;) {
 		mship = im6o->im6o_memberships.lh_first;
 		if (mship == NULL)
 			break;
 		LIST_REMOVE(mship, i6mm_chain);
 		if (mship->i6mm_maddr)
 			in6_delmulti(mship->i6mm_maddr);
 		free(mship, M_IP6MADDR);
 	}
 
 	free(im6o, M_IP6MOPTS);
 }
 
 /*
  * Set IPv6 outgoing packet options based on advanced API.
  */
 int
 ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
     struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
 {
 	struct cmsghdr *cm = 0;
 
 	if (control == NULL || opt == NULL)
 		return (EINVAL);
 
 	ip6_initpktopts(opt);
 	if (stickyopt) {
 		int error;
 
 		/*
 		 * If stickyopt is provided, make a local copy of the options
 		 * for this particular packet, then override them by ancillary
 		 * objects.
 		 * XXX: copypktopts() does not copy the cached route to a next
 		 * hop (if any).  This is not very good in terms of efficiency,
 		 * but we can allow this since this option should be rarely
 		 * used.
 		 */
 		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * XXX: Currently, we assume all the optional information is stored
 	 * in a single mbuf.
 	 */
 	if (control->m_next)
 		return (EINVAL);
 
 	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
 	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 		int error;
 
 		if (control->m_len < CMSG_LEN(0))
 			return (EINVAL);
 
 		cm = mtod(control, struct cmsghdr *);
 		if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
 			return (EINVAL);
 		if (cm->cmsg_level != IPPROTO_IPV6)
 			continue;
 
 		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
 		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Set a particular packet option, as a sticky option or an ancillary data
  * item.  "len" can be 0 only when it's a sticky option.
  * We have 4 cases of combination of "sticky" and "cmsg":
  * "sticky=0, cmsg=0": impossible
  * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
  * "sticky=1, cmsg=0": RFC3542 socket option
  * "sticky=1, cmsg=1": RFC2292 socket option
  */
 static int
 ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
     struct ucred *cred, int sticky, int cmsg, int uproto)
 {
 	INIT_VNET_NET(curvnet);
 	INIT_VNET_INET6(curvnet);
 	int minmtupolicy, preftemp;
 	int error;
 
 	if (!sticky && !cmsg) {
 #ifdef DIAGNOSTIC
 		printf("ip6_setpktopt: impossible case\n");
 #endif
 		return (EINVAL);
 	}
 
 	/*
 	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
 	 * not be specified in the context of RFC3542.  Conversely,
 	 * RFC3542 types should not be specified in the context of RFC2292.
 	 */
 	if (!cmsg) {
 		switch (optname) {
 		case IPV6_2292PKTINFO:
 		case IPV6_2292HOPLIMIT:
 		case IPV6_2292NEXTHOP:
 		case IPV6_2292HOPOPTS:
 		case IPV6_2292DSTOPTS:
 		case IPV6_2292RTHDR:
 		case IPV6_2292PKTOPTIONS:
 			return (ENOPROTOOPT);
 		}
 	}
 	if (sticky && cmsg) {
 		switch (optname) {
 		case IPV6_PKTINFO:
 		case IPV6_HOPLIMIT:
 		case IPV6_NEXTHOP:
 		case IPV6_HOPOPTS:
 		case IPV6_DSTOPTS:
 		case IPV6_RTHDRDSTOPTS:
 		case IPV6_RTHDR:
 		case IPV6_USE_MIN_MTU:
 		case IPV6_DONTFRAG:
 		case IPV6_TCLASS:
 		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
 			return (ENOPROTOOPT);
 		}
 	}
 
 	switch (optname) {
 	case IPV6_2292PKTINFO:
 	case IPV6_PKTINFO:
 	{
 		struct ifnet *ifp = NULL;
 		struct in6_pktinfo *pktinfo;
 
 		if (len != sizeof(struct in6_pktinfo))
 			return (EINVAL);
 
 		pktinfo = (struct in6_pktinfo *)buf;
 
 		/*
 		 * An application can clear any sticky IPV6_PKTINFO option by
 		 * doing a "regular" setsockopt with ipi6_addr being
 		 * in6addr_any and ipi6_ifindex being zero.
 		 * [RFC 3542, Section 6]
 		 */
 		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
 		    pktinfo->ipi6_ifindex == 0 &&
 		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			ip6_clearpktopts(opt, optname);
 			break;
 		}
 
 		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
 		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			return (EINVAL);
 		}
 
 		/* validate the interface index if specified. */
 		if (pktinfo->ipi6_ifindex > V_if_index ||
 		    pktinfo->ipi6_ifindex < 0) {
 			 return (ENXIO);
 		}
 		if (pktinfo->ipi6_ifindex) {
 			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
 			if (ifp == NULL)
 				return (ENXIO);
 		}
 
 		/*
 		 * We store the address anyway, and let in6_selectsrc()
 		 * validate the specified address.  This is because ipi6_addr
 		 * may not have enough information about its scope zone, and
 		 * we may need additional information (such as outgoing
 		 * interface or the scope zone of a destination address) to
 		 * disambiguate the scope.
 		 * XXX: the delay of the validation may confuse the
 		 * application when it is used as a sticky option.
 		 */
 		if (opt->ip6po_pktinfo == NULL) {
 			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
 			    M_IP6OPT, M_NOWAIT);
 			if (opt->ip6po_pktinfo == NULL)
 				return (ENOBUFS);
 		}
 		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
 		break;
 	}
 
 	case IPV6_2292HOPLIMIT:
 	case IPV6_HOPLIMIT:
 	{
 		int *hlimp;
 
 		/*
 		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
 		 * to simplify the ordering among hoplimit options.
 		 */
 		if (optname == IPV6_HOPLIMIT && sticky)
 			return (ENOPROTOOPT);
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		hlimp = (int *)buf;
 		if (*hlimp < -1 || *hlimp > 255)
 			return (EINVAL);
 
 		opt->ip6po_hlim = *hlimp;
 		break;
 	}
 
 	case IPV6_TCLASS:
 	{
 		int tclass;
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		tclass = *(int *)buf;
 		if (tclass < -1 || tclass > 255)
 			return (EINVAL);
 
 		opt->ip6po_tclass = tclass;
 		break;
 	}
 
 	case IPV6_2292NEXTHOP:
 	case IPV6_NEXTHOP:
 		if (cred != NULL) {
 			error = priv_check_cred(cred,
 			    PRIV_NETINET_SETHDROPTS, 0);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {	/* just remove the option */
 			ip6_clearpktopts(opt, IPV6_NEXTHOP);
 			break;
 		}
 
 		/* check if cmsg_len is large enough for sa_len */
 		if (len < sizeof(struct sockaddr) || len < *buf)
 			return (EINVAL);
 
 		switch (((struct sockaddr *)buf)->sa_family) {
 		case AF_INET6:
 		{
 			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
 			int error;
 
 			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 
 			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
 			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
 				return (EINVAL);
 			}
 			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
 			    != 0) {
 				return (error);
 			}
 			break;
 		}
 		case AF_LINK:	/* should eventually be supported */
 		default:
 			return (EAFNOSUPPORT);
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_NEXTHOP);
 		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_nexthop == NULL)
 			return (ENOBUFS);
 		bcopy(buf, opt->ip6po_nexthop, *buf);
 		break;
 
 	case IPV6_2292HOPOPTS:
 	case IPV6_HOPOPTS:
 	{
 		struct ip6_hbh *hbh;
 		int hbhlen;
 
 		/*
 		 * XXX: We don't allow a non-privileged user to set ANY HbH
 		 * options, since per-option restriction has too much
 		 * overhead.
 		 */
 		if (cred != NULL) {
 			error = priv_check_cred(cred,
 			    PRIV_NETINET_SETHDROPTS, 0);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_HOPOPTS);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_hbh))
 			return (EINVAL);
 		hbh = (struct ip6_hbh *)buf;
 		hbhlen = (hbh->ip6h_len + 1) << 3;
 		if (len != hbhlen)
 			return (EINVAL);
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_HOPOPTS);
 		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_hbh == NULL)
 			return (ENOBUFS);
 		bcopy(hbh, opt->ip6po_hbh, hbhlen);
 
 		break;
 	}
 
 	case IPV6_2292DSTOPTS:
 	case IPV6_DSTOPTS:
 	case IPV6_RTHDRDSTOPTS:
 	{
 		struct ip6_dest *dest, **newdest = NULL;
 		int destlen;
 
 		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
 			error = priv_check_cred(cred,
 			    PRIV_NETINET_SETHDROPTS, 0);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, optname);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_dest))
 			return (EINVAL);
 		dest = (struct ip6_dest *)buf;
 		destlen = (dest->ip6d_len + 1) << 3;
 		if (len != destlen)
 			return (EINVAL);
 
 		/*
 		 * Determine the position that the destination options header
 		 * should be inserted; before or after the routing header.
 		 */
 		switch (optname) {
 		case IPV6_2292DSTOPTS:
 			/*
 			 * The old advanced API is ambiguous on this point.
 			 * Our approach is to determine the position based
 			 * according to the existence of a routing header.
 			 * Note, however, that this depends on the order of the
 			 * extension headers in the ancillary data; the 1st
 			 * part of the destination options header must appear
 			 * before the routing header in the ancillary data,
 			 * too.
 			 * RFC3542 solved the ambiguity by introducing
 			 * separate ancillary data or option types.
 			 */
 			if (opt->ip6po_rthdr == NULL)
 				newdest = &opt->ip6po_dest1;
 			else
 				newdest = &opt->ip6po_dest2;
 			break;
 		case IPV6_RTHDRDSTOPTS:
 			newdest = &opt->ip6po_dest1;
 			break;
 		case IPV6_DSTOPTS:
 			newdest = &opt->ip6po_dest2;
 			break;
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, optname);
 		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
 		if (*newdest == NULL)
 			return (ENOBUFS);
 		bcopy(dest, *newdest, destlen);
 
 		break;
 	}
 
 	case IPV6_2292RTHDR:
 	case IPV6_RTHDR:
 	{
 		struct ip6_rthdr *rth;
 		int rthlen;
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_RTHDR);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_rthdr))
 			return (EINVAL);
 		rth = (struct ip6_rthdr *)buf;
 		rthlen = (rth->ip6r_len + 1) << 3;
 		if (len != rthlen)
 			return (EINVAL);
 
 		switch (rth->ip6r_type) {
 		case IPV6_RTHDR_TYPE_0:
 			if (rth->ip6r_len == 0)	/* must contain one addr */
 				return (EINVAL);
 			if (rth->ip6r_len % 2) /* length must be even */
 				return (EINVAL);
 			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);	/* not supported */
 		}
 
 		/* turn off the previous option */
 		ip6_clearpktopts(opt, IPV6_RTHDR);
 		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_rthdr == NULL)
 			return (ENOBUFS);
 		bcopy(rth, opt->ip6po_rthdr, rthlen);
 
 		break;
 	}
 
 	case IPV6_USE_MIN_MTU:
 		if (len != sizeof(int))
 			return (EINVAL);
 		minmtupolicy = *(int *)buf;
 		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
 		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
 		    minmtupolicy != IP6PO_MINMTU_ALL) {
 			return (EINVAL);
 		}
 		opt->ip6po_minmtu = minmtupolicy;
 		break;
 
 	case IPV6_DONTFRAG:
 		if (len != sizeof(int))
 			return (EINVAL);
 
 		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
 			/*
 			 * we ignore this option for TCP sockets.
 			 * (RFC3542 leaves this case unspecified.)
 			 */
 			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
 		} else
 			opt->ip6po_flags |= IP6PO_DONTFRAG;
 		break;
 
 	case IPV6_PREFER_TEMPADDR:
 		if (len != sizeof(int))
 			return (EINVAL);
 		preftemp = *(int *)buf;
 		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
 		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
 		    preftemp != IP6PO_TEMPADDR_PREFER) {
 			return (EINVAL);
 		}
 		opt->ip6po_prefer_tempaddr = preftemp;
 		break;
 
 	default:
 		return (ENOPROTOOPT);
 	} /* end of switch */
 
 	return (0);
 }
 
 /*
  * Loop back a copy of an outgoing IPv6 multicast packet, as requested by
  * ip6_output().  The copy is handed to if_simloop() together with the
  * caller's interface pointer, which need not be the loopback interface
  * itself; reusing the loopback path is simpler than duplicating it here.
  *
  * ifp: interface the copy is delivered on.
  * m:   packet to duplicate; it is only read by m_copy() here.
  * dst: destination; only its address family is passed on.
  *
  * Nothing is returned: if the copy cannot be made the function just
  * gives up, and the result of if_simloop() is ignored.
  */
 void
 ip6_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in6 *dst)
 {
 	struct ip6_hdr *hdr;
 	struct mbuf *loopm;
 	int need_private_hdr;
 
 	loopm = m_copy(m, 0, M_COPYALL);
 	if (loopm == NULL)
 		return;
 
 	/*
 	 * The IPv6 header is rewritten below.  If the copy keeps its data
 	 * in external storage (M_EXT), or the first mbuf is shorter than
 	 * the header, pull the header up so it can be overridden safely.
 	 * No explicit free follows a failed m_pullup(); per mbuf(9) it
 	 * releases the chain itself.
 	 */
 	need_private_hdr = (loopm->m_flags & M_EXT) != 0 ||
 	    loopm->m_len < sizeof(*hdr);
 	if (need_private_hdr) {
 		loopm = m_pullup(loopm, sizeof(*hdr));
 		if (loopm == NULL)
 			return;
 	}
 
 #ifdef DIAGNOSTIC
 	/* Sanity check: the whole header must sit in the first mbuf. */
 	if (loopm->m_len < sizeof(*hdr)) {
 		m_freem(loopm);
 		return;
 	}
 #endif
 
 	/*
 	 * Strip embedded scope identifiers from both addresses;
 	 * in6_clearscope() only modifies an address when it needs to.
 	 */
 	hdr = mtod(loopm, struct ip6_hdr *);
 	in6_clearscope(&hdr->ip6_src);
 	in6_clearscope(&hdr->ip6_dst);
 
 	(void)if_simloop(ifp, loopm, dst->sin6_family, 0);
 }
 
 /*
  * Chop IPv6 header off from the payload.
  */
 static int
 ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
 {
 	struct mbuf *mh;
 	struct ip6_hdr *ip6;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (m->m_len > sizeof(*ip6)) {
 		MGETHDR(mh, M_DONTWAIT, MT_HEADER);
 		if (mh == 0) {
 			m_freem(m);
 			return ENOBUFS;
 		}
 		M_MOVE_PKTHDR(mh, m);
 		MH_ALIGN(mh, sizeof(*ip6));
 		m->m_len -= sizeof(*ip6);
 		m->m_data += sizeof(*ip6);
 		mh->m_next = m;
 		m = mh;
 		m->m_len = sizeof(*ip6);
 		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
 	}
 	exthdrs->ip6e_ip6 = m;
 	return 0;
 }
 
 /*
  * Compute IPv6 extension header length.
  */
 int
 ip6_optlen(struct inpcb *in6p)
 {
 	int len;
 
 	if (!in6p->in6p_outputopts)
 		return 0;
 
 	len = 0;
 #define elen(x) \
     (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
 
 	len += elen(in6p->in6p_outputopts->ip6po_hbh);
 	if (in6p->in6p_outputopts->ip6po_rthdr)
 		/* dest1 is valid with rthdr only */
 		len += elen(in6p->in6p_outputopts->ip6po_dest1);
 	len += elen(in6p->in6p_outputopts->ip6po_rthdr);
 	len += elen(in6p->in6p_outputopts->ip6po_dest2);
 	return len;
 #undef elen
 }