Index: head/sys/kern/uipc_sockbuf.c
===================================================================
--- head/sys/kern/uipc_sockbuf.c	(revision 167714)
+++ head/sys/kern/uipc_sockbuf.c	(revision 167715)
@@ -1,947 +1,988 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_param.h"
 
 #include <sys/param.h>
 #include <sys/aio.h> /* for aio_swake proto */
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 /*
  * Function pointer set by the AIO routines so that the socket buffer code
  * can call back into the AIO module if it is loaded.  sowakeup() invokes it
  * for socket buffers that have SB_AIO set.
  */
 void	(*aio_swake)(struct socket *, struct sockbuf *);
 
 /*
  * Primitive routines for operating on socket buffers
  */
 
 /* Upper bound on socket buffer size; updated by sysctl_handle_sb_max(). */
 u_long	sb_max = SB_MAX;
 /*
  * sb_max scaled by MCLBYTES / (MSIZE + MCLBYTES).  sbreserve_locked()
  * rejects requests larger than this; sysctl_handle_sb_max() recomputes it
  * whenever sb_max changes.
  */
 static	u_long sb_max_adj =
     SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
 
 /* Multiplier sbreserve_locked() applies to the request to derive sb_mbmax. */
 static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
 
 /* Workers that perform no lock assertion; shared by the public variants. */
 static void	sbdrop_internal(struct sockbuf *sb, int len);
 static void	sbflush_internal(struct sockbuf *sb);
 static void	sbrelease_internal(struct sockbuf *sb, struct socket *so);
 
 /*
  * Socantsendmore indicates that no more data will be sent on the socket; it
  * would normally be applied to a socket when the user informs the system
  * that no more data is to be sent, by the protocol code (in case
  * PRU_SHUTDOWN).  Socantrcvmore indicates that no more data will be
  * received, and will normally be applied to the socket by a protocol when it
  * detects that the peer will send no more data.  Data queued for reading in
  * the socket may yet be read.
  *
  * The _locked variant expects the send buffer lock to be held on entry; the
  * trailing mtx_assert() checks that the lock is no longer owned once the
  * wakeup call has returned.
  */
 void
 socantsendmore_locked(struct socket *so)
 {
 
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 
 	so->so_snd.sb_state |= SBS_CANTSENDMORE;
 	sowwakeup_locked(so);
 	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
 }
 
 /*
  * Unlocked entry point: acquire the send buffer lock and mark the socket as
  * unable to send more.  There is no explicit unlock here; the assertion
  * checks that the lock is not owned after socantsendmore_locked() returns.
  */
 void
 socantsendmore(struct socket *so)
 {
 
 	SOCKBUF_LOCK(&so->so_snd);
 	socantsendmore_locked(so);
 	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
 }
 
 /*
  * Mark the socket as unable to receive more data (SBS_CANTRCVMORE) and wake
  * up readers.  The receive buffer lock must be held on entry; the trailing
  * assertion checks that it is no longer owned after the wakeup call.
  */
 void
 socantrcvmore_locked(struct socket *so)
 {
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
 	sorwakeup_locked(so);
 	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
 }
 
 /*
  * Unlocked entry point: acquire the receive buffer lock and mark the socket
  * as unable to receive more.  There is no explicit unlock here; the
  * assertion checks that the lock is not owned after socantrcvmore_locked()
  * returns.
  */
 void
 socantrcvmore(struct socket *so)
 {
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	socantrcvmore_locked(so);
 	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
 }
 
 /*
  * Wait for data to arrive at/drain from a socket buffer.
  *
  * Must be called with the socket buffer lock held.  Sets SB_WAIT so that
  * sowakeup() knows a sleeper exists, then sleeps on &sb->sb_cc with the
  * buffer mutex, for at most sb_timeo.  The sleep is interruptible (PCATCH)
  * unless SB_NOINTR is set.  Returns whatever msleep() returns.
  */
 int
 sbwait(struct sockbuf *sb)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	sb->sb_flags |= SB_WAIT;
 	return (msleep(&sb->sb_cc, &sb->sb_mtx,
 	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
 	    sb->sb_timeo));
 }
 
 /*
  * Acquire the SB_LOCK flag on a sockbuf whose mutex is already held by the
  * caller; return any error returned from sleep (EINTR).
  *
  * While another holder has SB_LOCK set, SB_WANT is raised and the thread
  * sleeps on &sb->sb_flags with the buffer mutex and no timeout.  The sleep
  * is interruptible unless SB_NOINTR is set.  Returns 0 once SB_LOCK has been
  * taken.
  */
 int
 sb_lock(struct sockbuf *sb)
 {
 	int error;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	while (sb->sb_flags & SB_LOCK) {
 		/* Announce that someone is waiting for SB_LOCK. */
 		sb->sb_flags |= SB_WANT;
 		error = msleep(&sb->sb_flags, &sb->sb_mtx,
 		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
 		    "sblock", 0);
 		if (error)
 			return (error);
 	}
 	sb->sb_flags |= SB_LOCK;
 	return (0);
 }
 
 /*
  * Wakeup processes waiting on a socket buffer.  Do asynchronous notification
  * via SIGIO if the socket has the SS_ASYNC flag set.
  *
  * Called with the socket buffer lock held; will release the lock by the end
  * of the function.  This allows the caller to acquire the socket buffer lock
  * while testing for the need for various sorts of wakeup and hold it through
  * to the point where it's no longer required.  We currently hold the lock
  * through calls out to other subsystems (with the exception of kqueue), and
  * then release it to avoid lock order issues.  It's not clear that's
  * correct.
  */
 void
 sowakeup(struct socket *so, struct sockbuf *sb)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	/* Notify select()/poll() waiters and clear the pending-select flag. */
 	selwakeuppri(&sb->sb_sel, PSOCK);
 	sb->sb_flags &= ~SB_SEL;
 	/* Wake threads sleeping in sbwait(), which sleep on &sb->sb_cc. */
 	if (sb->sb_flags & SB_WAIT) {
 		sb->sb_flags &= ~SB_WAIT;
 		wakeup(&sb->sb_cc);
 	}
 	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
 	SOCKBUF_UNLOCK(sb);
 	/*
 	 * Everything below runs after the buffer lock has been dropped.
 	 * NOTE(review): sb_flags is still read here without the lock --
 	 * confirm that this is intended.
 	 */
 	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
 		pgsigio(&so->so_sigio, SIGIO, 0);
 	if (sb->sb_flags & SB_UPCALL)
 		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
 	if (sb->sb_flags & SB_AIO)
 		aio_swake(so, sb);
 	mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
 }
 
 /*
  * Socket buffer (struct sockbuf) utility routines.
  *
  * Each socket contains two socket buffers: one for sending data and one for
  * receiving data.  Each buffer contains a queue of mbufs, information about
  * the number of mbufs and amount of data in the queue, and other fields
  * allowing select() statements and notification on data availability to be
  * implemented.
  *
  * Data stored in a socket buffer is maintained as a list of records.  Each
  * record is a list of mbufs chained together with the m_next field.  Records
  * are chained together with the m_nextpkt field. The upper level routine
  * soreceive() expects the following conventions to be observed when placing
  * information in the receive buffer:
  *
  * 1. If the protocol requires each message be preceded by the sender's name,
  *    then a record containing that name must be present before any
  *    associated data (mbuf's must be of type MT_SONAME).
  * 2. If the protocol supports the exchange of ``access rights'' (really just
  *    additional data associated with the message), and there are ``rights''
  *    to be received, then a record containing this data should be present
  *    (mbuf's must be of type MT_RIGHTS).
  * 3. If a name or rights record exists, then it must be followed by a data
  *    record, perhaps of zero length.
  *
  * Before using a new socket structure it is first necessary to reserve
  * buffer space to the socket, by calling sbreserve().  This should commit
  * some of the available buffer space in the system buffer pool for the
  * socket (currently, it does nothing but enforce limits).  The space should
  * be released by calling sbrelease() when the socket is destroyed.
  */
 /*
  * Reserve send (sndcc) and receive (rcvcc) buffer space for a socket.
  *
  * Both socket buffer locks are taken (send first, then receive) and dropped
  * in the reverse order.  On success the low-water marks are given defaults
  * (1 for receive, MCLBYTES for send, the latter clamped to sb_hiwat) and 0
  * is returned.  If either reservation fails ENOBUFS is returned; a send
  * reservation that succeeded before the receive one failed is released.
  */
 int
 soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
 {
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	error = ENOBUFS;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbreserve_locked(&so->so_snd, sndcc, so, td) != 0) {
 		if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) != 0) {
 			if (so->so_rcv.sb_lowat == 0)
 				so->so_rcv.sb_lowat = 1;
 			if (so->so_snd.sb_lowat == 0)
 				so->so_snd.sb_lowat = MCLBYTES;
 			if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
 				so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
 			error = 0;
 		} else {
 			/* Undo the send-side reservation. */
 			sbrelease_locked(&so->so_snd, so);
 		}
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (error);
 }
 
 static int
 sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
 {
 	int error = 0;
 	u_long tmp_sb_max = sb_max;
 
 	error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
 	if (error || !req->newptr)
 		return (error);
 	if (tmp_sb_max < MSIZE + MCLBYTES)
 		return (EINVAL);
 	sb_max = tmp_sb_max;
 	sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
 	return (0);
 }
 	
 /*
  * Allot mbufs to a sockbuf.  Attempt to scale mbmax so that mbcnt doesn't
  * become limiting if buffering efficiency is near the normal case.
  *
  * Must be called with the socket buffer lock held.  Returns 1 on success and
  * 0 if cc exceeds sb_max_adj or chgsbsize() refuses the new size.
  */
 int
 sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
     struct thread *td)
 {
 	rlim_t sbsize_limit;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	/*
 	 * td will only be NULL when we're in an interrupt (e.g. in
 	 * tcp_input()).
 	 *
 	 * XXXRW: This comment needs updating, as might the code.
 	 */
 	if (cc > sb_max_adj)
 		return (0);
 	/* With no thread there is no RLIMIT_SBSIZE to consult: no limit. */
 	if (td != NULL) {
 		PROC_LOCK(td->td_proc);
 		sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE);
 		PROC_UNLOCK(td->td_proc);
 	} else
 		sbsize_limit = RLIM_INFINITY;
 	if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
 	    sbsize_limit))
 		return (0);
 	/*
 	 * NOTE(review): the operands are u_long; if min() takes u_int
 	 * arguments they are narrowed here -- confirm.
 	 */
 	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
 	/* Keep the low-water mark within the new high-water mark. */
 	if (sb->sb_lowat > sb->sb_hiwat)
 		sb->sb_lowat = sb->sb_hiwat;
 	return (1);
 }
 
 /*
  * Unlocked wrapper around sbreserve_locked(): takes the socket buffer lock
  * for the duration of the reservation.  The result is a success flag, not
  * an errno: 1 if the space was reserved, 0 otherwise.
  */
 int
 sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
     struct thread *td)
 {
 	int reserved;
 
 	SOCKBUF_LOCK(sb);
 	reserved = sbreserve_locked(sb, cc, so, td);
 	SOCKBUF_UNLOCK(sb);
 
 	return (reserved);
 }
 
 /*
  * Free mbufs held by a socket, and reserved mbuf space.
  *
  * Performs no lock assertion itself so that it can be shared by
  * sbrelease_locked() and sbdestroy().  Flushes the buffer, returns the
  * reservation via chgsbsize() with a new size of 0, and clears sb_mbmax.
  */
 static void
 sbrelease_internal(struct sockbuf *sb, struct socket *so)
 {
 
 	sbflush_internal(sb);
 	(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
 	    RLIM_INFINITY);
 	sb->sb_mbmax = 0;
 }
 
 /*
  * Release a socket buffer's mbufs and reservation; the caller must hold the
  * socket buffer lock.
  */
 void
 sbrelease_locked(struct sockbuf *sb, struct socket *so)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	sbrelease_internal(sb, so);
 }
 
 /*
  * Release a socket buffer's mbufs and reservation, taking and dropping the
  * socket buffer lock around the operation.
  */
 void
 sbrelease(struct sockbuf *sb, struct socket *so)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbrelease_locked(sb, so);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Release a socket buffer's mbufs and reservation without acquiring or
  * asserting the socket buffer lock.
  * NOTE(review): presumably intended for teardown, when no other reference
  * to the buffer remains -- confirm against callers.
  */
 void
 sbdestroy(struct sockbuf *sb, struct socket *so)
 {
 
 	sbrelease_internal(sb, so);
 }
 
 
 /*
  * Routines to add and remove data from an mbuf queue.
  *
  * The routines sbappend() or sbappendrecord() are normally called to append
  * new mbufs to a socket buffer, after checking that adequate space is
  * available, comparing the function sbspace() with the amount of data to be
  * added.  sbappendrecord() differs from sbappend() in that data supplied is
  * treated as the beginning of a new record.  To place a sender's address,
  * optional access rights, and data in a socket receive buffer,
  * sbappendaddr() should be used.  To place access rights and data in a
  * socket receive buffer, sbappendrights() should be used.  In either case,
  * the new data begins a new record.  Note that unlike sbappend() and
  * sbappendrecord(), these routines check for the caller that there will be
  * enough space to store the data.  Each fails if there is not enough space,
  * or if it cannot find mbufs to store additional information in.
  *
  * Reliable protocols may use the socket send buffer to hold data awaiting
  * acknowledgement.  Data is normally copied from a socket send buffer in a
  * protocol with m_copy for output to a peer, and then removing the data from
  * the socket buffer with sbdrop() or sbdroprecord() when the data is
  * acknowledged by the peer.
  */
 #ifdef SOCKBUF_DEBUG
 /*
  * Debug check: verify that sb_lastrecord really points at the last record
  * of the buffer.  On mismatch the record chain is printed and the kernel
  * panics, naming the caller's file and line.
  */
 void
 sblastrecordchk(struct sockbuf *sb, const char *file, int line)
 {
 	struct mbuf *rec;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	/* Follow m_nextpkt to the final record (NULL for an empty buffer). */
 	for (rec = sb->sb_mb; rec != NULL && rec->m_nextpkt != NULL;
 	    rec = rec->m_nextpkt)
 		continue;
 
 	if (rec == sb->sb_lastrecord)
 		return;
 
 	printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
 	    __func__, sb->sb_mb, sb->sb_lastrecord, rec);
 	printf("packet chain:\n");
 	for (rec = sb->sb_mb; rec != NULL; rec = rec->m_nextpkt)
 		printf("\t%p\n", rec);
 	panic("%s from %s:%u", __func__, file, line);
 }
 
 /*
  * Debug check: verify that sb_mbtail really points at the last mbuf of the
  * last record.  On mismatch every record is printed with its mbufs and the
  * kernel panics, naming the caller's file and line.
  */
 void
 sblastmbufchk(struct sockbuf *sb, const char *file, int line)
 {
 	struct mbuf *rec, *mb;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	/* Find the last record, then the last mbuf within it. */
 	for (rec = sb->sb_mb; rec != NULL && rec->m_nextpkt != NULL;
 	    rec = rec->m_nextpkt)
 		continue;
 	for (mb = rec; mb != NULL && mb->m_next != NULL; mb = mb->m_next)
 		continue;
 
 	if (mb == sb->sb_mbtail)
 		return;
 
 	printf("%s: sb_mb %p sb_mbtail %p last %p\n",
 	    __func__, sb->sb_mb, sb->sb_mbtail, mb);
 	printf("packet tree:\n");
 	for (rec = sb->sb_mb; rec != NULL; rec = rec->m_nextpkt) {
 		printf("\t");
 		for (mb = rec; mb != NULL; mb = mb->m_next)
 			printf("%p ", mb);
 		printf("\n");
 	}
 	panic("%s from %s:%u", __func__, file, line);
 }
 #endif /* SOCKBUF_DEBUG */
 
 /*
  * Link record m0 at the end of the record list of sb: chain it after the
  * current last record via m_nextpkt, or make it the head if there is none,
  * then make it the new sb_lastrecord.  Requires the socket buffer lock.
  */
 #define SBLINKRECORD(sb, m0) do {					\
 	SOCKBUF_LOCK_ASSERT(sb);					\
 	if ((sb)->sb_lastrecord != NULL)				\
 		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
 	else								\
 		(sb)->sb_mb = (m0);					\
 	(sb)->sb_lastrecord = (m0);					\
 } while (/*CONSTCOND*/0)
 
 /*
  * Append mbuf chain m to the last record in the socket buffer sb.  The
  * additional space associated with the mbuf chain is recorded in sb.  Empty
  * mbufs are discarded and mbufs are compacted where possible.
  *
  * If the last record already contains an mbuf flagged M_EOR, the chain is
  * instead appended as a new record.  A NULL chain is ignored.  The socket
  * buffer lock must be held.
  */
 void
 sbappend_locked(struct sockbuf *sb, struct mbuf *m)
 {
 	struct mbuf *n;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	if (m == 0)
 		return;
 
 	SBLASTRECORDCHK(sb);
 	n = sb->sb_mb;
 	if (n) {
 		/* Walk to the last record ... */
 		while (n->m_nextpkt)
 			n = n->m_nextpkt;
 		/* ... and scan it for M_EOR; n ends on its last mbuf. */
 		do {
 			if (n->m_flags & M_EOR) {
 				sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
 				return;
 			}
 		} while (n->m_next && (n = n->m_next));
 	} else {
 		/*
 		 * XXX Would like to simply use sb_mbtail here, but
 		 * XXX I need to verify that I won't miss an EOR that
 		 * XXX way.
 		 */
 		if ((n = sb->sb_lastrecord) != NULL) {
 			do {
 				if (n->m_flags & M_EOR) {
 					sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
 					return;
 				}
 			} while (n->m_next && (n = n->m_next));
 		} else {
 			/*
 			 * If this is the first record in the socket buffer,
 			 * it's also the last record.
 			 */
 			sb->sb_lastrecord = m;
 		}
 	}
 	/* n is the mbuf to append after, or NULL for an empty buffer. */
 	sbcompress(sb, m, n);
 	SBLASTRECORDCHK(sb);
 }
 
 /*
  * Append mbuf chain m to the last record in the socket buffer sb.  The
  * additional space associated with the mbuf chain is recorded in sb.  Empty
  * mbufs are discarded and mbufs are compacted where possible.
  *
  * Unlocked wrapper: takes the socket buffer lock around sbappend_locked().
  */
 void
 sbappend(struct sockbuf *sb, struct mbuf *m)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbappend_locked(sb, m);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * This version of sbappend() should only be used when the caller absolutely
  * knows that there will never be more than one record in the socket buffer,
  * that is, a stream protocol (such as TCP).
  *
  * The chain is appended directly after sb_mbtail, with no scan of the
  * record.  The socket buffer lock must be held.
  * NOTE(review): unlike sbappend_locked(), m is dereferenced without a NULL
  * check -- callers presumably never pass NULL; confirm.
  */
 void
 sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
 {
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	/* The chain must be a single record ... */
 	KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
 	/* ... and the buffer must hold at most one record. */
 	KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));
 
 	SBLASTMBUFCHK(sb);
 
 	sbcompress(sb, m, sb->sb_mbtail);
 
 	/* With a single record, the head is also the last record. */
 	sb->sb_lastrecord = sb->sb_mb;
 	SBLASTRECORDCHK(sb);
 }
 
 /*
  * This version of sbappend() should only be used when the caller absolutely
  * knows that there will never be more than one record in the socket buffer,
  * that is, a stream protocol (such as TCP).
  *
  * Unlocked wrapper: takes the socket buffer lock around
  * sbappendstream_locked().
  */
 void
 sbappendstream(struct sockbuf *sb, struct mbuf *m)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbappendstream_locked(sb, m);
 	SOCKBUF_UNLOCK(sb);
 }
 
 #ifdef SOCKBUF_DEBUG
 void
 sbcheck(struct sockbuf *sb)
 {
 	struct mbuf *m;
 	struct mbuf *n = 0;
 	u_long len = 0, mbcnt = 0;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	for (m = sb->sb_mb; m; m = n) {
 	    n = m->m_nextpkt;
 	    for (; m; m = m->m_next) {
 		len += m->m_len;
 		mbcnt += MSIZE;
 		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
 			mbcnt += m->m_ext.ext_size;
 	    }
 	}
 	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
 		printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
 		    mbcnt, sb->sb_mbcnt);
 		panic("sbcheck");
 	}
 }
 #endif
 
 /*
  * As above, except the mbuf chain begins a new record.
  */
 void
 sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
 {
 	struct mbuf *m;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	if (m0 == 0)
 		return;
 	m = sb->sb_mb;
 	if (m)
 		while (m->m_nextpkt)
 			m = m->m_nextpkt;
 	/*
 	 * Put the first mbuf on the queue.  Note this permits zero length
 	 * records.
 	 */
 	sballoc(sb, m0);
 	SBLASTRECORDCHK(sb);
 	SBLINKRECORD(sb, m0);
 	if (m)
 		m->m_nextpkt = m0;
 	else
 		sb->sb_mb = m0;
 	m = m0->m_next;
 	m0->m_next = 0;
 	if (m && (m0->m_flags & M_EOR)) {
 		m0->m_flags &= ~M_EOR;
 		m->m_flags |= M_EOR;
 	}
 	sbcompress(sb, m, m0);
 }
 
 /*
  * As above, except the mbuf chain begins a new record.
  *
  * Unlocked wrapper: takes the socket buffer lock around
  * sbappendrecord_locked().
  */
 void
 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbappendrecord_locked(sb, m0);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Append address and data, and optionally, control (ancillary) data to the
  * receive queue of a socket.  If present, m0 must include a packet header
  * with total length.  Returns 0 if no space in sockbuf or insufficient
  * mbufs.
  *
  * On success (return 1) the new record is laid out as: an MT_SONAME mbuf
  * holding a copy of asa, then the control chain, then the data chain.  On
  * failure the caller's chains are left unmodified.  The socket buffer lock
  * must be held.
  */
 int
 sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
     struct mbuf *m0, struct mbuf *control)
 {
 	struct mbuf *m, *n, *nlast;
 	int space = asa->sa_len;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
 		panic("sbappendaddr_locked");
 	if (m0)
 		space += m0->m_pkthdr.len;
 	/* n is passed to m_length() to receive the control chain's last mbuf. */
 	space += m_length(control, &n);
 
 	if (space > sbspace(sb))
 		return (0);
 #if MSIZE <= 256
 	if (asa->sa_len > MLEN)
 		return (0);
 #endif
 	/* Non-sleeping allocation of the mbuf that will hold the address. */
 	MGET(m, M_DONTWAIT, MT_SONAME);
 	if (m == 0)
 		return (0);
 	m->m_len = asa->sa_len;
 	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
 	if (n)
 		n->m_next = m0;		/* concatenate data to control */
 	else
 		control = m0;
 	m->m_next = control;
 	/* Account for every mbuf of the record; n ends on the last one. */
 	for (n = m; n->m_next != NULL; n = n->m_next)
 		sballoc(sb, n);
 	sballoc(sb, n);
 	nlast = n;
 	SBLINKRECORD(sb, m);
 
 	sb->sb_mbtail = nlast;
 	SBLASTMBUFCHK(sb);
 
 	SBLASTRECORDCHK(sb);
 	return (1);
 }
 
 /*
  * Unlocked form of sbappendaddr_locked(): queue an address, optional control
  * data and data as a new record on a socket's receive buffer, holding the
  * socket buffer lock for the duration.  If present, m0 must include a packet
  * header with total length.  Returns 1 on success, or 0 if there is no space
  * in the sockbuf or no mbuf could be obtained.
  */
 int
 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
     struct mbuf *m0, struct mbuf *control)
 {
 	int appended;
 
 	SOCKBUF_LOCK(sb);
 	appended = sbappendaddr_locked(sb, asa, m0, control);
 	SOCKBUF_UNLOCK(sb);
 
 	return (appended);
 }
 
 /*
  * Append control (ancillary) data followed by data as a new record.  The
  * control chain is mandatory; passing NULL panics.  Returns 0, leaving both
  * chains untouched, if the combined length does not fit in the buffer, and
  * 1 on success.  The socket buffer lock must be held.
  */
 int
 sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
     struct mbuf *control)
 {
 	struct mbuf *m, *n, *mlast;
 	int space;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	if (control == 0)
 		panic("sbappendcontrol_locked");
 	/* n is passed to m_length() to receive the control chain's last mbuf. */
 	space = m_length(control, &n) + m_length(m0, NULL);
 
 	if (space > sbspace(sb))
 		return (0);
 	n->m_next = m0;			/* concatenate data to control */
 
 	SBLASTRECORDCHK(sb);
 
 	/* Account for every mbuf of the record; m ends on the last one. */
 	for (m = control; m->m_next; m = m->m_next)
 		sballoc(sb, m);
 	sballoc(sb, m);
 	mlast = m;
 	SBLINKRECORD(sb, control);
 
 	sb->sb_mbtail = mlast;
 	SBLASTMBUFCHK(sb);
 
 	SBLASTRECORDCHK(sb);
 	return (1);
 }
 
 /*
  * Unlocked form of sbappendcontrol_locked(): holds the socket buffer lock
  * while the control + data record is appended.  Returns 1 on success and 0
  * if the record does not fit.
  */
 int
 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
 {
 	int appended;
 
 	SOCKBUF_LOCK(sb);
 	appended = sbappendcontrol_locked(sb, m0, control);
 	SOCKBUF_UNLOCK(sb);
 
 	return (appended);
 }
 
 /*
  * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
  * (n).  If (n) is NULL, the buffer is presumed empty.
  *
  * When the data is compressed, mbufs in the chain may be handled in one of
  * three ways:
  *
  * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
  *     record boundary, and no change in data type).
  *
  * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
  *     an mbuf already in the socket buffer.  This can occur if an
  *     appropriate mbuf exists, there is room, and no merging of data types
  *     will occur.
  *
  * (3) The mbuf may be appended to the end of the existing mbuf chain.
  *
  * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
  * end-of-record.
  *
  * The socket buffer lock must be held.  sb_mbtail is only updated in
  * case (3), i.e. when an mbuf is actually linked into the buffer.
  */
 void
 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
 {
 	int eor = 0;
 	struct mbuf *o;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	while (m) {
 		/* Accumulate any end-of-record mark seen so far. */
 		eor |= m->m_flags & M_EOR;
 		/* Case (1): drop an empty mbuf that carries no information. */
 		if (m->m_len == 0 &&
 		    (eor == 0 ||
 		     (((o = m->m_next) || (o = n)) &&
 		      o->m_type == m->m_type))) {
 			if (sb->sb_lastrecord == m)
 				sb->sb_lastrecord = m->m_next;
 			m = m_free(m);
 			continue;
 		}
 		/* Case (2): copy a small mbuf into the free space of n. */
 		if (n && (n->m_flags & M_EOR) == 0 &&
 		    M_WRITABLE(n) &&
 		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
 		    m->m_len <= M_TRAILINGSPACE(n) &&
 		    n->m_type == m->m_type) {
 			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
 			    (unsigned)m->m_len);
 			n->m_len += m->m_len;
 			sb->sb_cc += m->m_len;
 			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
 				/* XXX: Probably don't need.*/
 				sb->sb_ctl += m->m_len;
 			m = m_free(m);
 			continue;
 		}
 		/* Case (3): link m after n (or as the head) and account for it. */
 		if (n)
 			n->m_next = m;
 		else
 			sb->sb_mb = m;
 		sb->sb_mbtail = m;
 		sballoc(sb, m);
 		n = m;
 		/* M_EOR is re-applied to the final mbuf after the loop. */
 		m->m_flags &= ~M_EOR;
 		m = m->m_next;
 		n->m_next = 0;
 	}
 	if (eor) {
 		KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
 		n->m_flags |= eor;
 	}
 	SBLASTMBUFCHK(sb);
 }
 
 /*
  * Free all mbufs in a sockbuf.  Check that all resources are reclaimed.
  *
  * Panics if the buffer is marked SB_LOCK, or if any byte count, mbuf
  * pointer or mbuf storage count remains after the flush.  Performs no lock
  * assertion itself; sbflush_locked() does that.
  */
 static void
 sbflush_internal(struct sockbuf *sb)
 {
 
 	if (sb->sb_flags & SB_LOCK)
 		panic("sbflush_internal: locked");
 	while (sb->sb_mbcnt) {
 		/*
 		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
 		 * we would loop forever. Panic instead.
 		 */
 		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
 			break;
 		sbdrop_internal(sb, (int)sb->sb_cc);
 	}
 	/* Reached via the break above when the accounting is inconsistent. */
 	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
 		panic("sbflush_internal: cc %u || mb %p || mbcnt %u",
 		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
 }
 
 /*
  * Free all mbufs in a sockbuf; the caller must hold the socket buffer lock.
  */
 void
 sbflush_locked(struct sockbuf *sb)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	sbflush_internal(sb);
 }
 
 /*
  * Free all mbufs in a sockbuf, taking and dropping the socket buffer lock
  * around the operation.
  */
 void
 sbflush(struct sockbuf *sb)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbflush_locked(sb);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Drop data from (the front of) a sockbuf.
  *
  * Removes len bytes: whole mbufs are freed, and a final partially consumed
  * mbuf is trimmed in place.  Dropping continues into following records when
  * the first one is exhausted; asking for more data than the buffer holds
  * panics.  Performs no lock assertion itself; sbdrop_locked() does that.
  */
 static void
 sbdrop_internal(struct sockbuf *sb, int len)
 {
 	struct mbuf *m;
 	struct mbuf *next;
 
 	/* m walks the current record; next is the record that follows it. */
 	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
 	while (len > 0) {
 		if (m == 0) {
 			/* Current record exhausted: move on to the next one. */
 			if (next == 0)
 				panic("sbdrop");
 			m = next;
 			next = m->m_nextpkt;
 			continue;
 		}
 		if (m->m_len > len) {
 			/* Partial drop: trim the front of this mbuf in place. */
 			m->m_len -= len;
 			m->m_data += len;
 			sb->sb_cc -= len;
+			/* Keep a non-zero cached send offset in step. */
+			if (sb->sb_sndptroff != 0)
+				sb->sb_sndptroff -= len;
 			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
 				sb->sb_ctl -= len;
 			break;
 		}
 		len -= m->m_len;
 		sbfree(sb, m);
 		m = m_free(m);
 	}
 	/* Also discard any empty mbufs now at the front. */
 	while (m && m->m_len == 0) {
 		sbfree(sb, m);
 		m = m_free(m);
 	}
 	if (m) {
 		sb->sb_mb = m;
 		m->m_nextpkt = next;
 	} else
 		sb->sb_mb = next;
 	/*
 	 * First part is an inline SB_EMPTY_FIXUP().  Second part makes sure
 	 * sb_lastrecord is up-to-date if we dropped part of the last record.
 	 */
 	m = sb->sb_mb;
 	if (m == NULL) {
 		sb->sb_mbtail = NULL;
 		sb->sb_lastrecord = NULL;
 	} else if (m->m_nextpkt == NULL) {
 		sb->sb_lastrecord = m;
 	}
 }
 
 /*
  * Drop data from (the front of) a sockbuf.  The caller must hold the socket
  * buffer lock.
  */
 void
 sbdrop_locked(struct sockbuf *sb, int len)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	sbdrop_internal(sb, len);
 }
 
 /*
  * Drop len bytes from the front of a sockbuf, taking and dropping the socket
  * buffer lock around the operation.
  */
 void
 sbdrop(struct sockbuf *sb, int len)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbdrop_locked(sb, len);
 	SOCKBUF_UNLOCK(sb);
+}
+
+
+/*
+ * Maintain a pointer and offset pair into the socket buffer mbuf chain to
+ * avoid traversal of the entire socket buffer for larger offsets.
+ *
+ * Returns the mbuf from which the caller should start looking for byte
+ * 'off' of the buffer, and stores in *moff the offset of that byte counted
+ * from the start of the returned mbuf.  The cached position (sb_sndptr,
+ * sb_sndptroff) is then advanced towards the mbuf holding the last byte of
+ * the request (off + len - 1), so that the next call starts as close as
+ * possible to where this one ended.
+ */
+struct mbuf *
+sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
+{
+	struct mbuf *m, *ret;
+
+	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
+	KASSERT(off + len <= sb->sb_cc, ("%s: beyond sb", __func__));
+	KASSERT(sb->sb_sndptroff <= sb->sb_cc,
+	    ("%s: sndptroff broken", __func__));
+
+	/*
+	 * Is off below stored offset? Happens on retransmits.
+	 * Just return, we can't help here.
+	 */
+	if (sb->sb_sndptroff > off) {
+		*moff = off;
+		return (sb->sb_mb);
+	}
+
+	/* Return closest mbuf in chain for current offset. */
+	*moff = off - sb->sb_sndptroff;
+	m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb;
+
+	/*
+	 * A zero-length request has no last byte to advance to; computing
+	 * "len - 1" below would wrap the unsigned offset and walk off the
+	 * end of the chain.  Leave the cached position untouched.
+	 */
+	if (len == 0)
+		return (ret);
+
+	/* Advance by len to be as close as possible for the next transmit. */
+	for (off = off - sb->sb_sndptroff + len - 1;
+	     m != NULL && off > 0 && off >= m->m_len;
+	     m = m->m_next) {
+		sb->sb_sndptroff += m->m_len;
+		off -= m->m_len;
+	}
+	/*
+	 * Running off the chain means the request or the cached position did
+	 * not match the buffer contents.  Fall back to the "no cached
+	 * position" state rather than keep a NULL pointer paired with a
+	 * non-zero offset, which would make later lookups return the wrong
+	 * data.
+	 */
+	if (m == NULL)
+		sb->sb_sndptroff = 0;
+	sb->sb_sndptr = m;
+
+	return (ret);
 }
 
 /*
  * Drop a record off the front of a sockbuf and move the next record to the
  * front.
  */
 void
 sbdroprecord_locked(struct sockbuf *sb)
 {
 	struct mbuf *m;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	m = sb->sb_mb;
 	if (m) {
 		sb->sb_mb = m->m_nextpkt;
 		do {
 			sbfree(sb, m);
 			m = m_free(m);
 		} while (m);
 	}
 	SB_EMPTY_FIXUP(sb);
 }
 
 /*
  * Drop a record off the front of a sockbuf and move the next record to the
  * front.
  *
  * Unlocked wrapper: takes the socket buffer lock around
  * sbdroprecord_locked().
  */
 void
 sbdroprecord(struct sockbuf *sb)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbdroprecord_locked(sb);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
 static int dummy;
 SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
 /*
  * maxsockbuf under _kern_ipc: goes through sysctl_handle_sb_max() so that
  * new values are validated and sb_max_adj is kept in step with sb_max.
  */
 SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
     &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
 /* Exposes sb_efficiency, the multiplier used by sbreserve_locked(). */
 SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
     &sb_efficiency, 0, "");
Index: head/sys/netinet/tcp_output.c
===================================================================
--- head/sys/netinet/tcp_output.c	(revision 167714)
+++ head/sys/netinet/tcp_output.c	(revision 167715)
@@ -1,1375 +1,1385 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 #include "opt_tcpdebug.h"
 #include "opt_tcp_sack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/tcp.h>
 #define	TCPOUTFLAGS
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 
 #ifdef IPSEC
 #include <netinet6/ipsec.h>
 #endif /*IPSEC*/
 
 #ifdef FAST_IPSEC
 #include <netipsec/ipsec.h>
 #define	IPSEC
 #endif /*FAST_IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef notyet
 /* Only needed by the disabled "notyet" copy path inside tcp_output(). */
 extern struct mbuf *m_copypack();
 #endif
 
 /*
  * Tunables for the TCP output path.  Each one is registered as a read/write
  * integer under the _net_inet_tcp sysctl parent.
  */
 
 /* Path MTU Discovery switch; on by default. */
 int path_mtu_discovery = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
 	&path_mtu_discovery, 1, "Enable Path MTU Discovery");
 
 /*
  * Slow start flight size.  When tcp_output() restarts after an idle period
  * it sets snd_cwnd to t_maxseg multiplied by this value.
  */
 int ss_fltsz = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
 	&ss_fltsz, 1, "Slow start flight size");
 
 /*
  * Slow start flight size used instead of ss_fltsz when the foreign address
  * is reported as local by in_localaddr() / in6_localaddr().
  */
 int ss_fltsz_local = 4;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
 	&ss_fltsz_local, 1, "Slow start flight size for local networks");
 
 /* NewReno switch; on by default. */
 int     tcp_do_newreno = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
 	0, "Enable NewReno Algorithms");
 
 /*
  * Global TCP Segmentation Offload switch; tcp_output() only considers TSO
  * for a connection when this is non-zero.
  */
 int	tcp_do_tso = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
 	&tcp_do_tso, 0, "Enable TCP Segmentation Offload");
 
 /*
  * Global switch for automatic send buffer sizing; tcp_output() checks it
  * together with the SB_AUTOSIZE flag of the send sockbuf.
  */
 int	tcp_do_autosndbuf = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
 	&tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
 
 /* Amount added to so_snd.sb_hiwat per automatic growth step. */
 int	tcp_autosndbuf_inc = 8*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
 	&tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
 
 /* Upper bound that automatic sizing may grow the send buffer to. */
 int	tcp_autosndbuf_max = 256*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
 	&tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
 
 
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
 int
 tcp_output(struct tcpcb *tp)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 	long len, recwin, sendwin;
 	int off, flags, error;
 #ifdef TCP_SIGNATURE
 	int sigoff = 0;
 #endif
 	struct mbuf *m;
 	struct ip *ip = NULL;
 	struct ipovly *ipov = NULL;
 	struct tcphdr *th;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 	int idle, sendalot;
 	int sack_rxmit, sack_bytes_rxmt;
 	struct sackhole *p;
 	int tso = 0;
 	struct tcpopt to;
 #if 0
 	int maxburst = TCP_MAXBURST;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 #endif
 
 	INP_LOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Determine length of data that should be transmitted,
 	 * and flags that will be used.
 	 * If there is some data or critical controls (SYN, RST)
 	 * to send, then transmit; otherwise, investigate further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 	if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
 		/*
 		 * We have been idle for "a while" and no acks are
 		 * expected to clock out any data we send --
 		 * slow start to get ack "clock" running again.
 		 *
 		 * Set the slow-start flight size depending on whether
 		 * this is a local network or not.
 		 */
 		int ss = ss_fltsz;
 #ifdef INET6
 		if (isipv6) {
 			if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
 				ss = ss_fltsz_local;
 		} else
 #endif /* INET6 */
 		if (in_localaddr(tp->t_inpcb->inp_faddr))
 			ss = ss_fltsz_local;
 		tp->snd_cwnd = tp->t_maxseg * ss;
 	}
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
 		if (tp->t_flags & TF_MORETOCOME) {
 			tp->t_flags |= TF_LASTIDLE;
 			idle = 0;
 		}
 	}
 again:
 	/*
 	 * If we've recently taken a timeout, snd_max will be greater than
 	 * snd_nxt.  There may be SACK information that allows us to avoid
 	 * resending already delivered data.  Adjust snd_nxt accordingly.
 	 */
 	if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
 		tcp_sack_adjust(tp);
 	sendalot = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 	sendwin = min(sendwin, tp->snd_bwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
 	 * Send any SACK-generated retransmissions.  If we're explicitly trying
 	 * to send out new data (when sendalot is 1), bypass this function.
 	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
 	 * we're replacing a (future) new transmission with a retransmission
 	 * now, and we previously incremented snd_cwnd in tcp_input().
 	 */
 	/*
 	 * Still in sack recovery , reset rxmit flag to zero.
 	 */
 	sack_rxmit = 0;
 	sack_bytes_rxmt = 0;
 	len = 0;
 	p = NULL;
 	if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
 	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 		long cwin;
 		
 		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
 		if (cwin < 0)
 			cwin = 0;
 		/* Do not retransmit SACK segments beyond snd_recover */
 		if (SEQ_GT(p->end, tp->snd_recover)) {
 			/*
 			 * (At least) part of sack hole extends beyond
 			 * snd_recover. Check to see if we can rexmit data
 			 * for this hole.
 			 */
 			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
 				/*
 				 * Can't rexmit any more data for this hole.
 				 * That data will be rexmitted in the next
 				 * sack recovery episode, when snd_recover
 				 * moves past p->rxmit.
 				 */
 				p = NULL;
 				goto after_sack_rexmit;
 			} else
 				/* Can rexmit part of the current hole */
 				len = ((long)ulmin(cwin,
 						   tp->snd_recover - p->rxmit));
 		} else
 			len = ((long)ulmin(cwin, p->end - p->rxmit));
 		off = p->rxmit - tp->snd_una;
 		KASSERT(off >= 0,("%s: sack block to the left of una : %d",
 		    __func__, off));
 		if (len > 0) {
 			sack_rxmit = 1;
 			sendalot = 1;
 			tcpstat.tcps_sack_rexmits++;
 			tcpstat.tcps_sack_rexmit_bytes +=
 			    min(len, tp->t_maxseg);
 		}
 	}
 after_sack_rexmit:
 	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
 	if (tp->t_flags & TF_NEEDFIN)
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	/*
 	 * If in persist timeout with window of 0, send 1 byte.
 	 * Otherwise, if window is small but nonzero
 	 * and timer expired, we will send what we can
 	 * and go to transmit state.
 	 */
 	if (tp->t_flags & TF_FORCEDATA) {
 		if (sendwin == 0) {
 			/*
 			 * If we still have some data to send, then
 			 * clear the FIN bit.  Usually this would
 			 * happen below when it realizes that we
 			 * aren't sending all the data.  However,
 			 * if we have exactly 1 byte of unsent data,
 			 * then it won't clear the FIN bit below,
 			 * and if we are in persist state, we wind
 			 * up sending the packet without recording
 			 * that we sent the FIN bit.
 			 *
 			 * We can't just blindly clear the FIN bit,
 			 * because if we don't have any more data
 			 * to send then the probe will be the FIN
 			 * itself.
 			 */
 			if (off < so->so_snd.sb_cc)
 				flags &= ~TH_FIN;
 			sendwin = 1;
 		} else {
 			callout_stop(tp->tt_persist);
 			tp->t_rxtshift = 0;
 		}
 	}
 
 	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
 	 * a negative length.  This can also occur when TCP opens up
 	 * its congestion window while receiving additional duplicate
 	 * acks after fast-retransmit because TCP will reset snd_nxt
 	 * to snd_max after the fast-retransmit.
 	 *
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will
 	 * be set to snd_una, the offset will be 0, and the length may
 	 * wind up 0.
 	 *
 	 * If sack_rxmit is true we are retransmitting from the scoreboard
 	 * in which case len is already set.
 	 */
 	if (sack_rxmit == 0) {
 		if (sack_bytes_rxmt == 0)
 			len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
 		else {
 			long cwin;
 
                         /*
 			 * We are inside of a SACK recovery episode and are
 			 * sending new data, having retransmitted all the
 			 * data possible in the scoreboard.
 			 */
 			len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) 
 			       - off);
 			/*
 			 * Don't remove this (len > 0) check !
 			 * We explicitly check for len > 0 here (although it 
 			 * isn't really necessary), to work around a gcc 
 			 * optimization issue - to force gcc to compute
 			 * len above. Without this check, the computation
 			 * of len is bungled by the optimizer.
 			 */
 			if (len > 0) {
 				cwin = tp->snd_cwnd - 
 					(tp->snd_nxt - tp->sack_newdata) -
 					sack_bytes_rxmt;
 				if (cwin < 0)
 					cwin = 0;
 				len = lmin(len, cwin);
 			}
 		}
 	}
 
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this
 	 * is SYN-SENT state and if segment contains data and if we don't
 	 * know that foreign host supports TAO, suppress sending segment.
 	 */
 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 		if (tp->t_state != TCPS_SYN_RECEIVED)
 			flags &= ~TH_SYN;
 		off--, len++;
 	}
 
 	/*
 	 * Be careful not to send data and/or FIN on SYN segments.
 	 * This measure is needed to prevent interoperability problems
 	 * with not fully conformant TCP implementations.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 		len = 0;
 		flags &= ~TH_FIN;
 	}
 
 	if (len < 0) {
 		/*
 		 * If FIN has been sent but not acked,
 		 * but we haven't been called to retransmit,
 		 * len will be < 0.  Otherwise, window shrank
 		 * after we sent into it.  If window shrank to 0,
 		 * cancel pending retransmit, pull snd_nxt back
 		 * to (closed) window, and set the persist timer
 		 * if it isn't already going.  If the window didn't
 		 * close completely, just wait for an ACK.
 		 */
 		len = 0;
 		if (sendwin == 0) {
 			callout_stop(tp->tt_rexmt);
 			tp->t_rxtshift = 0;
 			tp->snd_nxt = tp->snd_una;
 			if (!callout_active(tp->tt_persist))
 				tcp_setpersist(tp);
 		}
 	}
 
 	/* len will be >= 0 after this point. */
 	KASSERT(len >= 0, ("%s: len < 0", __func__));
 
 	/*
 	 * Automatic sizing of send socket buffer.  Often the send buffer
 	 * size is not optimally adjusted to the actual network conditions
 	 * at hand (delay bandwidth product).  Setting the buffer size too
 	 * small limits throughput on links with high bandwidth and high
 	 * delay (eg. trans-continental/oceanic links).  Setting the
 	 * buffer size too big consumes too much real kernel memory,
 	 * especially with many connections on busy servers.
 	 *
 	 * The criteria to step up the send buffer one notch are:
 	 *  1. receive window of remote host is larger than send buffer
 	 *     (with a fudge factor of 5/4th);
 	 *  2. send buffer is filled to 7/8th with data (so we actually
 	 *     have data to make use of it);
 	 *  3. send buffer fill has not hit maximal automatic size;
 	 *  4. our send window (slow start and congestion controlled) is
 	 *     larger than sent but unacknowledged data in send buffer.
 	 *
 	 * The remote host receive window scaling factor may limit the
 	 * growing of the send buffer before it reaches its allowed
 	 * maximum.
 	 *
 	 * It scales directly with slow start or congestion window
 	 * and does at most one step per received ACK.  This fast
 	 * scaling has the drawback of growing the send buffer beyond
 	 * what is strictly necessary to make full use of a given
 	 * delay*bandwidth product.  However testing has shown this not
 	 * to be much of a problem.  At worst we are trading wasting
 	 * of available bandwidth (the non-use of it) for wasting some
 	 * socket buffer memory.
 	 *
 	 * TODO: Shrink send buffer during idle periods together
 	 * with congestion window.  Requires another timer.  Has to
 	 * wait for upcoming tcp timer rewrite.
 	 */
 	if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
 		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
 		    so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
 		    so->so_snd.sb_cc < tcp_autosndbuf_max &&
 		    sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
 			if (!sbreserve_locked(&so->so_snd,
 			    min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
 			     tcp_autosndbuf_max), so, curthread))
 				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
 		}
 	}
 
 	/*
 	 * Truncate to the maximum segment length or enable TCP Segmentation
 	 * Offloading (if supported by hardware) and ensure that FIN is removed
 	 * if the length no longer contains the last data byte.
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertisements and
 	 * IP options prevent using TSO.  With TSO the TCP header is the same
 	 * (except for the sequence number) for all generated packets.  This
 	 * makes it impossible to transmit any options which vary per generated
 	 * segment or packet.
 	 *
 	 * The length of TSO bursts is limited to TCP_MAXWIN.  That limit and
 	 * removal of FIN (if not already caught here) are handled later after
 	 * the exact length of the TCP options are known.
 	 */
 	if (len > tp->t_maxseg) {
 		if ((tp->t_flags & TF_TSO) && tcp_do_tso &&
 		    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 		    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 		    tp->t_inpcb->inp_options == NULL &&
 		    tp->t_inpcb->in6p_options == NULL &&
 		    tp->t_inpcb->inp_sp == NULL) {
 			tso = 1;
 		} else {
 			len = tp->t_maxseg;
 			sendalot = 1;
 			tso = 0;
 		}
 	}
 	if (sack_rxmit) {
 		if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
 			flags &= ~TH_FIN;
 	} else {
 		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
 			flags &= ~TH_FIN;
 	}
 
 	recwin = sbspace(&so->so_rcv);
 
 	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
 	 *	- We have a full segment (or more with TSO)
 	 *	- This is the last buffer in a write()/send() and we are
 	 *	  either idle or running NODELAY
 	 *	- we've timed out (e.g. persist timer)
 	 *	- we have more than 1/2 the maximum send window's worth of
 	 *	  data (receiver may have limited the window size)
 	 *	- we need to retransmit
 	 */
 	if (len) {
 		if (len >= tp->t_maxseg)
 			goto send;
 		/*
 		 * NOTE! on localhost connections an 'ack' from the remote
 		 * end may occur synchronously with the output and cause
 		 * us to flush a buffer queued with moretocome.  XXX
 		 *
 		 * note: the len + off check is almost certainly unnecessary.
 		 */
 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
 		    (idle || (tp->t_flags & TF_NODELAY)) &&
 		    len + off >= so->so_snd.sb_cc &&
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			goto send;
 		}
 		if (tp->t_flags & TF_FORCEDATA)		/* typ. timeout case */
 			goto send;
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
 			goto send;
 		if (sack_rxmit)
 			goto send;
 	}
 
 	/*
 	 * Compare available window to amount of window
 	 * known to peer (as advertised window less
 	 * next expected input).  If the difference is at least two
 	 * max size segments, or at least 50% of the maximum possible
 	 * window, then want to send a window update to peer.
 	 * Skip this if the connection is in T/TCP half-open state.
 	 */
 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
 		/*
 		 * "adv" is the amount we can increase the window,
 		 * taking into account that we are limited by
 		 * TCP_MAXWIN << tp->rcv_scale.
 		 */
 		long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
 			(tp->rcv_adv - tp->rcv_nxt);
 
 		if (adv >= (long) (2 * tp->t_maxseg))
 			goto send;
 		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
 			goto send;
 	}
 
 	/*
 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW)
 		goto send;
 	if ((flags & TH_RST) ||
 	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 		goto send;
 	if (SEQ_GT(tp->snd_up, tp->snd_una))
 		goto send;
 	/*
 	 * If our state indicates that FIN should be sent
 	 * and we have not yet done so, then we need to send.
 	 */
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
 	/*
 	 * In SACK, it is possible for tcp_output to fail to send a segment
 	 * after the retransmission timer has been turned off.  Make sure
 	 * that the retransmission timer is set.
 	 */
 	if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
 	    !callout_active(tp->tt_rexmt) &&
 	    !callout_active(tp->tt_persist)) {
 		callout_reset(tp->tt_rexmt, tp->t_rxtcur,
 			      tcp_timer_rexmt, tp);
 		goto just_return;
 	} 
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
 	 * using ``persist'' packets is used to insure receipt of window
 	 * updates.  The three ``states'' for the output side are:
 	 *	idle			not doing retransmits or persists
 	 *	persisting		to move a small or zero window
 	 *	(re)transmitting	and thereby not persisting
 	 *
 	 * callout_active(tp->tt_persist)
 	 *	is true when we are in persist state.
 	 * (tp->t_flags & TF_FORCEDATA)
 	 *	is set when we are called to send a persist packet.
 	 * callout_active(tp->tt_rexmt)
 	 *	is set when we are retransmitting
 	 * The output side is idle when both timers are zero.
 	 *
 	 * If send window is too small, there is data to transmit, and no
 	 * retransmit or persist is pending, then go to persist state.
 	 * If nothing happens soon, send when timer expires:
 	 * if window is nonzero, transmit what we can,
 	 * otherwise force out a byte.
 	 */
 	if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) &&
 	    !callout_active(tp->tt_persist)) {
 		tp->t_rxtshift = 0;
 		tcp_setpersist(tp);
 	}
 
 	/*
 	 * No reason to send a segment, just return.
 	 */
 just_return:
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 
 send:
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 	/*
 	 * Before ESTABLISHED, force sending of initial options
 	 * unless TCP set not to do any options.
 	 * NOTE: we assume that the IP/TCP header plus TCP options
 	 * always fit in a single mbuf, leaving room for a maximum
 	 * link header, i.e.
 	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 	 */
 	optlen = 0;
 #ifdef INET6
 	if (isipv6)
 		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 	else
 #endif
 	hdrlen = sizeof (struct tcpiphdr);
 
 	/*
 	 * Compute options for segment.
 	 * We only have to care about SYN and established connection
 	 * segments.  Options for SYN-ACK segments are handled in TCP
 	 * syncache.
 	 */
 	if ((tp->t_flags & TF_NOOPT) == 0) {
 		to.to_flags = 0;
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			tp->snd_nxt = tp->iss;
 			to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
 			to.to_flags |= TOF_MSS;
 		}
 		/* Window scaling. */
 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
 			to.to_wscale = tp->request_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		/* Timestamps. */
 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
 			to.to_tsval = ticks + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 			/* Set receive buffer autosizing timestamp. */
 			if (tp->rfbuf_ts == 0 &&
 			    (so->so_rcv.sb_flags & SB_AUTOSIZE))
 				tp->rfbuf_ts = ticks;
 		}
 		/* Selective ACK's. */
 		if (tp->sack_enable) {
 			if (flags & TH_SYN)
 				to.to_flags |= TOF_SACKPERM;
 			else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 			    (tp->t_flags & TF_SACK_PERMIT) &&
 			    tp->rcv_numsacks > 0) {
 				to.to_flags |= TOF_SACK;
 				to.to_nsacks = tp->rcv_numsacks;
 				to.to_sacks = (u_char *)tp->sackblks;
 			}
 		}
 #ifdef TCP_SIGNATURE
 		/* TCP-MD5 (RFC2385). */
 #ifdef INET6
 		if (!isipv6 && (tp->t_flags & TF_SIGNATURE))
 #else
 		if (tp->t_flags & TF_SIGNATURE)
 #endif /* INET6 */
 			to.to_flags |= TOF_SIGNATURE;
 #endif /* TCP_SIGNATURE */
 
 		/* Processing the options. */
 		hdrlen += optlen = tcp_addoptions(&to, (u_char *)&opt);
 
 #ifdef TCP_SIGNATURE
 		sigoff = to.to_signature - (u_char *)&to;
 #endif /* TCP_SIGNATURE */
 	}
 
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 	if (tp->t_inpcb->inp_options)
 		ipoptlen = tp->t_inpcb->inp_options->m_len -
 				offsetof(struct ipoption, ipopt_list);
 	else
 		ipoptlen = 0;
 #ifdef IPSEC
 	ipoptlen += ipsec_hdrsiz_tcp(tp);
 #endif
 
 	/*
 	 * Adjust data length if insertion of options will
 	 * bump the packet length beyond the t_maxopd length.
 	 * Clear the FIN bit because we cut off the tail of
 	 * the segment.
 	 *
 	 * When doing TSO limit a burst to TCP_MAXWIN minus the
 	 * IP, TCP and Options length to keep ip->ip_len from
 	 * overflowing.  Prevent the last segment from being
 	 * fractional thus making them all equal sized and set
 	 * the flag to continue sending.  TSO is disabled when
 	 * IP options or IPSEC are present.
 	 */
 	if (len + optlen + ipoptlen > tp->t_maxopd) {
 		flags &= ~TH_FIN;
 		if (tso) {
 			if (len > TCP_MAXWIN - hdrlen) {
 				len = TCP_MAXWIN - hdrlen - optlen;
 				len = len - (len % (tp->t_maxopd - optlen));
 				sendalot = 1;
 			} else if (tp->t_flags & TF_NEEDFIN)
 				sendalot = 1;
 		} else {
 			len = tp->t_maxopd - optlen - ipoptlen;
 			sendalot = 1;
 		}
 	}
 
 /*#ifdef DIAGNOSTIC*/
 #ifdef INET6
 	if (max_linkhdr + hdrlen > MCLBYTES)
 #else
 	if (max_linkhdr + hdrlen > MHLEN)
 #endif
 		panic("tcphdr too big");
 /*#endif*/
 
 	/*
 	 * Grab a header mbuf, attaching a copy of data to
 	 * be transmitted, and initialize the header from
 	 * the template for sends on this connection.
 	 */
 	if (len) {
+		struct mbuf *mb;
+		u_int moff;
+
 		if ((tp->t_flags & TF_FORCEDATA) && len == 1)
 			tcpstat.tcps_sndprobe++;
 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 			tcpstat.tcps_sndrexmitpack++;
 			tcpstat.tcps_sndrexmitbyte += len;
 		} else {
 			tcpstat.tcps_sndpack++;
 			tcpstat.tcps_sndbyte += len;
 		}
 #ifdef notyet
 		if ((m = m_copypack(so->so_snd.sb_mb, off,
 		    (int)len, max_linkhdr + hdrlen)) == 0) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOBUFS;
 			goto out;
 		}
 		/*
 		 * m_copypack left space for our hdr; use it.
 		 */
 		m->m_len += hdrlen;
 		m->m_data -= hdrlen;
 #else
 		MGETHDR(m, M_DONTWAIT, MT_DATA);
 		if (m == NULL) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOBUFS;
 			goto out;
 		}
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr) {
 			MCLGET(m, M_DONTWAIT);
 			if ((m->m_flags & M_EXT) == 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				m_freem(m);
 				error = ENOBUFS;
 				goto out;
 			}
 		}
 #endif
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
+
+		/*
+		 * Start the m_copy functions from the closest mbuf
+		 * to the offset in the socket buffer chain.
+		 */
+		mb = sbsndptr(&so->so_snd, off, len, &moff);
+
 		if (len <= MHLEN - hdrlen - max_linkhdr) {
-			m_copydata(so->so_snd.sb_mb, off, (int)len,
+			m_copydata(mb, moff, (int)len,
 			    mtod(m, caddr_t) + hdrlen);
 			m->m_len += len;
 		} else {
-			m->m_next = m_copy(so->so_snd.sb_mb, off, (int)len);
-			if (m->m_next == 0) {
+			m->m_next = m_copy(mb, moff, (int)len);
+			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				(void) m_free(m);
 				error = ENOBUFS;
 				goto out;
 			}
 		}
 #endif
 		/*
 		 * If we're sending everything we've got, set PUSH.
 		 * (This will keep happy those implementations which only
 		 * give data to the user when a buffer fills or
 		 * a PUSH comes in.)
 		 */
 		if (off + len == so->so_snd.sb_cc)
 			flags |= TH_PUSH;
 		SOCKBUF_UNLOCK(&so->so_snd);
 	} else {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (tp->t_flags & TF_ACKNOW)
 			tcpstat.tcps_sndacks++;
 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
 			tcpstat.tcps_sndctrl++;
 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
 			tcpstat.tcps_sndurg++;
 		else
 			tcpstat.tcps_sndwinup++;
 
 		MGETHDR(m, M_DONTWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			goto out;
 		}
 #ifdef INET6
 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
 		    MHLEN >= hdrlen) {
 			MH_ALIGN(m, hdrlen);
 		} else
 #endif
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 	}
 	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef MAC
 	mac_create_mbuf_from_inpcb(tp->t_inpcb, m);
 #endif
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip6, th);
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 		ipov = (struct ipovly *)ip;
 		th = (struct tcphdr *)(ip + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip, th);
 	}
 
 	/*
 	 * Fill in fields, remembering maximum advertised
 	 * window for use in delaying messages about window sizes.
 	 * If resending a FIN, be sure not to use a new sequence number.
 	 */
 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
 	/*
 	 * If we are doing retransmissions, then snd_nxt will
 	 * not reflect the first unsent octet.  For ACK only
 	 * packets, we do not want the sequence number of the
 	 * retransmitted packet, we want the sequence number
 	 * of the next unsent octet.  So, if there is no data
 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
 	 * when filling in ti_seq.  But if we are in persist
 	 * state, snd_max might reflect one byte beyond the
 	 * right edge of the window, so use snd_nxt in that
 	 * case, since we know we aren't doing a retransmission.
 	 * (retransmit and persist are mutually exclusive...)
 	 */
 	if (sack_rxmit == 0) {
 		if (len || (flags & (TH_SYN|TH_FIN))
 		    || callout_active(tp->tt_persist))
 			th->th_seq = htonl(tp->snd_nxt);
 		else
 			th->th_seq = htonl(tp->snd_max);
 	} else {
 		th->th_seq = htonl(p->rxmit);
 		p->rxmit += len;
 		tp->sackhint.sack_bytes_rexmit += len;
 	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 	}
 	th->th_flags = flags;
 	/*
 	 * Calculate receive window.  Don't shrink window,
 	 * but avoid silly window syndrome.
 	 *
 	 * XXX: RFC1323:  The Window field in a SYN (i.e., a <SYN> or
 	 * <SYN,ACK>) segment itself is never scaled.
 	 */
 	if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
 	    recwin < (long)tp->t_maxseg)
 		recwin = 0;
 	if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
 		recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
 	if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
 		recwin = (long)TCP_MAXWIN << tp->rcv_scale;
 	th->th_win = htons((u_short) (recwin >> tp->rcv_scale));
 
 
 	/*
 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised
 	 * a 0 window.  This may cause the remote transmitter to stall.  This
 	 * flag tells soreceive() to disable delayed acknowledgements when
 	 * draining the buffer.  This can occur if the receiver is attempting
 	 * to read more data than can be buffered prior to transmitting on
 	 * the connection.
 	 */
 	if (recwin == 0)
 		tp->t_flags |= TF_RXWIN0SENT;
 	else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
 		th->th_flags |= TH_URG;
 	} else
 		/*
 		 * If no urgent pointer to send, then we pull
 		 * the urgent pointer to the left edge of the send window
 		 * so that it doesn't drift into the send window on sequence
 		 * number wraparound.
 		 */
 		tp->snd_up = tp->snd_una;		/* drag it along */
 
 #ifdef TCP_SIGNATURE
 #ifdef INET6
 	if (!isipv6)
 #endif
 	if (tp->t_flags & TF_SIGNATURE)
 		tcp_signature_compute(m, sizeof(struct ip), len, optlen,
 		    (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
 #endif
 
 	/*
 	 * Put TCP length in extended header, and then
 	 * checksum extended header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
 #ifdef INET6
 	if (isipv6)
 		/*
 		 * ip6_plen does not need to be filled now, and will be filled
 		 * in ip6_output.
 		 */
 		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
 				       sizeof(struct tcphdr) + optlen + len);
 	else
 #endif /* INET6 */
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
 
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 
 	/*
 	 * Enable TSO and specify the size of the segments.
 	 * The TCP pseudo header checksum is always provided.
 	 * XXX: Fixme: This is currently not the case for IPv6.
 	 */
 	if (tso) {
 		m->m_pkthdr.csum_flags = CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
 	}
 
 	/*
 	 * In transmit state, time the transmission and arrange for
 	 * the retransmit.  In persist state, just set snd_max.
 	 */
 	if ((tp->t_flags & TF_FORCEDATA) == 0 || 
 	    !callout_active(tp->tt_persist)) {
 		tcp_seq startseq = tp->snd_nxt;
 
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
 		if (flags & (TH_SYN|TH_FIN)) {
 			if (flags & TH_SYN)
 				tp->snd_nxt++;
 			if (flags & TH_FIN) {
 				tp->snd_nxt++;
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
 		if (sack_rxmit)
 			goto timer;
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			tp->snd_max = tp->snd_nxt;
 			/*
 			 * Time this transmission if not a retransmission and
 			 * not currently timing anything.
 			 */
 			if (tp->t_rtttime == 0) {
 				tp->t_rtttime = ticks;
 				tp->t_rtseq = startseq;
 				tcpstat.tcps_segstimed++;
 			}
 		}
 
 		/*
 		 * Set retransmit timer if not currently set,
 		 * and not doing a pure ack or a keep-alive probe.
 		 * Initial value for retransmit timer is smoothed
 		 * round-trip time + 2 * round-trip time variance.
 		 * Initialize shift counter which is used for backoff
 		 * of retransmit time.
 		 */
 timer:
 		if (!callout_active(tp->tt_rexmt) &&
 		    ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
 		     (tp->snd_nxt != tp->snd_una))) {
 			if (callout_active(tp->tt_persist)) {
 				callout_stop(tp->tt_persist);
 				tp->t_rxtshift = 0;
 			}
 			callout_reset(tp->tt_rexmt, tp->t_rxtcur,
 				      tcp_timer_rexmt, tp);
 		}
 	} else {
 		/*
 		 * Persist case, update snd_max but since we are in
 		 * persist mode (no window) we do not update snd_nxt.
 		 */
 		int xlen = len;
 		if (flags & TH_SYN)
 			++xlen;
 		if (flags & TH_FIN) {
 			++xlen;
 			tp->t_flags |= TF_SENTFIN;
 		}
 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
 			tp->snd_max = tp->snd_nxt + len;
 	}
 
 #ifdef TCPDEBUG
 	/*
 	 * Trace.
 	 */
 	if (so->so_options & SO_DEBUG) {
 		u_short save = 0;
 #ifdef INET6
 		if (!isipv6)
 #endif
 		{
 			save = ipov->ih_len;
 			ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
 		}
 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
 #ifdef INET6
 		if (!isipv6)
 #endif
 		ipov->ih_len = save;
 	}
 #endif
 
 	/*
 	 * Fill in IP length and desired time to live and
 	 * send to IP level.  There should be a better way
 	 * to handle ttl and tos; we could keep them in
 	 * the template, but need a way to checksum without them.
 	 */
 	/*
 	 * m->m_pkthdr.len should have been set before the checksum
 	 * calculation, because in6_cksum() needs it.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * we separately set hoplimit for every segment, since the
 		 * user might want to change the value via setsockopt.
 		 * Also, desired default hop limit might be changed via
 		 * Neighbor Discovery.
 		 */
 		ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
 
 		/* TODO: IPv6 IP6TOS_ECT bit on */
 		error = ip6_output(m,
 			    tp->t_inpcb->in6p_outputopts, NULL,
 			    ((so->so_options & SO_DONTROUTE) ?
 			    IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb);
 	} else
 #endif /* INET6 */
     {
 	ip->ip_len = m->m_pkthdr.len;
 #ifdef INET6
 	if (INP_CHECK_SOCKAF(so, AF_INET6))
 		ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
 #endif /* INET6 */
 	/*
 	 * If we do path MTU discovery, then we set DF on every packet.
 	 * This might not be the best thing to do according to RFC3390
 	 * Section 2. However the tcp hostcache mitigates the problem
 	 * so it affects only the first tcp connection with a host.
 	 */
 	if (path_mtu_discovery)
 		ip->ip_off |= IP_DF;
 
 	error = ip_output(m, tp->t_inpcb->inp_options, NULL,
 	    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 	    tp->t_inpcb);
     }
 	if (error) {
 
 		/*
 		 * We know that the packet was lost, so back out the
 		 * sequence number advance, if any.
 		 *
 		 * If the error is EPERM the packet got blocked by the
 		 * local firewall.  Normally we should terminate the
 		 * connection but the blocking may have been spurious
 		 * due to a firewall reconfiguration cycle.  So we treat
 		 * it like a packet loss and let the retransmit timer and
 		 * timeouts do their work over time.
 		 * XXX: It is a POLA question whether calling tcp_drop right
 		 * away would be the really correct behavior instead.
 		 */
 		if (((tp->t_flags & TF_FORCEDATA) == 0 ||
 		    !callout_active(tp->tt_persist)) &&
 		    ((flags & TH_SYN) == 0) &&
 		    (error != EPERM)) {
 			if (sack_rxmit) {
 				p->rxmit -= len;
 				tp->sackhint.sack_bytes_rexmit -= len;
 				KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
 				    ("sackhint bytes rtx >= 0"));
 			} else
 				tp->snd_nxt -= len;
 		}
 out:
 		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);	/* Check gotos. */
 		switch (error) {
 		case EPERM:
 			tp->t_softerror = error;
 			return (error);
 		case ENOBUFS:
 	                if (!callout_active(tp->tt_rexmt) &&
 			    !callout_active(tp->tt_persist))
 	                        callout_reset(tp->tt_rexmt, tp->t_rxtcur,
 				    tcp_timer_rexmt, tp);
 			tp->snd_cwnd = tp->t_maxseg;
 			return (0);
 		case EMSGSIZE:
 			/*
 			 * For some reason the interface we used initially
 			 * to send segments changed to another or lowered
 			 * its MTU.
 			 *
 			 * tcp_mtudisc() will find out the new MTU and as
 			 * its last action, initiate retransmission, so it
 			 * is important to not do so here.
 			 *
 			 * If TSO was active we either got an interface
 			 * without TSO capabilities or TSO was turned off.
 			 * Disable it for this connection as well and
 			 * immediately retry with MSS sized segments generated
 			 * by this function.
 			 */
 			if (tso)
 				tp->t_flags &= ~TF_TSO;
 			tcp_mtudisc(tp->t_inpcb, 0);
 			return (0);
 		case EHOSTDOWN:
 		case EHOSTUNREACH:
 		case ENETDOWN:
 		case ENETUNREACH:
 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
 				tp->t_softerror = error;
 				return (0);
 			}
 			/* FALLTHROUGH */
 		default:
 			return (error);
 		}
 	}
 	tcpstat.tcps_sndtotal++;
 
 	/*
 	 * Data sent (as far as we can tell).
 	 * If this advertises a larger window than any other segment,
 	 * then remember the size of the advertised window.
 	 * Any pending ACK has now been sent.
 	 */
 	if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 	tp->last_ack_sent = tp->rcv_nxt;
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 	if (callout_active(tp->tt_delack))
 		callout_stop(tp->tt_delack);
 #if 0
 	/*
 	 * This completely breaks TCP if newreno is turned on.  What happens
 	 * is that if delayed-acks are turned on on the receiver, this code
 	 * on the transmitter effectively destroys the TCP window, forcing
 	 * it to four packets (1.5Kx4 = 6K window).
 	 */
 	if (sendalot && (!tcp_do_newreno || --maxburst))
 		goto again;
 #endif
 	if (sendalot)
 		goto again;
 	return (0);
 }
 
 /*
  * Start or restart the persist timer of connection tp.
  *
  * The base interval is computed from t_srtt and t_rttvar, multiplied by
  * the tcp_backoff[] entry selected by the current shift count, and passed
  * through TCPT_RANGESET with the bounds TCPTV_PERSMIN and TCPTV_PERSMAX.
  * The shift count is then advanced unless it has already reached
  * TCP_MAXRXTSHIFT.  Panics if the retransmit timer is active.
  */
 void
 tcp_setpersist(tp)
 	register struct tcpcb *tp;
 {
 	int rtt_base;
 	int persist_ticks;
 
 	/* Arming the persist timer while a retransmit is pending is fatal. */
 	if (callout_active(tp->tt_rexmt))
 		panic("tcp_setpersist: retransmit pending");
 
 	rtt_base = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
 
 	/* Apply the backoff factor, then range-limit the resulting interval. */
 	TCPT_RANGESET(persist_ticks,
 	    rtt_base * tcp_backoff[tp->t_rxtshift],
 	    TCPTV_PERSMIN, TCPTV_PERSMAX);
 	callout_reset(tp->tt_persist, persist_ticks, tcp_timer_persist, tp);
 
 	/* Select the next backoff entry for the following restart. */
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift += 1;
 }
 
 /*
  * Insert TCP options according to the supplied parameters to the place
  * optp in a consistent way.  Can handle unaligned destinations.
  *
  * The order of the option processing is crucial for optimal packing and
  * alignment for the scarce option space.
  *
  * The optimal order for a SYN/SYN-ACK segment is:
  *   MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
  *   Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
  *
  * The SACK options should be last.  SACK blocks consume 8*n+2 bytes.
  * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
  * At minimum we need 10 bytes (to generate 1 SACK block).  If both
  * TCP Timestamps (12 bytes including the alignment NOPs) and TCP
  * Signatures (18 bytes) are present, we only have 10 bytes for SACK
  * options (40 - (12 + 18)).
  *
  * Parameters:
  *   to   - options to emit; each TOF_* bit set in to->to_flags selects one
  *          option, processed from the lowest bit upwards.
  *   optp - destination buffer for the encoded options.
  *
  * Returns the number of bytes written, always a multiple of 4.
  *
  * Side effects on *to:
  *   - to_mss, to_tsval and to_tsecr are converted to network byte order
  *     in place, so the structure must not be encoded a second time.
  *   - to_signature is set to point at the zero-filled signature bytes
  *     inside the output buffer when the signature option is emitted.
  *
  * Only the signature and SACK options are checked against MAX_TCPOPTLEN
  * before being written (their alignment NOPs are emitted before that
  * check); the other options are written unconditionally and the total is
  * merely asserted at the end.
  */
 int
 tcp_addoptions(struct tcpopt *to, u_char *optp)
 {
 	u_int mask, optlen = 0;
 
 	for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
 		if ((to->to_flags & mask) != mask)
 			continue;
 		switch (to->to_flags & mask) {
 		case TOF_MSS:
 			/* Start the MSS option on a 4 byte boundary. */
 			while (optlen % 4) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			optlen += TCPOLEN_MAXSEG;
 			*optp++ = TCPOPT_MAXSEG;
 			*optp++ = TCPOLEN_MAXSEG;
 			to->to_mss = htons(to->to_mss);
 			bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
 			optp += sizeof(to->to_mss);
 			break;
 		case TOF_SCALE:
 			/* Pad so that the 3 byte option ends on an even offset. */
 			while (!optlen || optlen % 2 != 1) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			optlen += TCPOLEN_WINDOW;
 			*optp++ = TCPOPT_WINDOW;
 			*optp++ = TCPOLEN_WINDOW;
 			*optp++ = to->to_wscale;
 			break;
 		case TOF_SACKPERM:
 			while (optlen % 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			optlen += TCPOLEN_SACK_PERMITTED;
 			*optp++ = TCPOPT_SACK_PERMITTED;
 			*optp++ = TCPOLEN_SACK_PERMITTED;
 			break;
 		case TOF_TS:
 			/*
 			 * Pad to offset 2 mod 4 so the two 32 bit values that
 			 * follow the kind/length bytes are 4 byte aligned.
 			 */
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			optlen += TCPOLEN_TIMESTAMP;
 			*optp++ = TCPOPT_TIMESTAMP;
 			*optp++ = TCPOLEN_TIMESTAMP;
 			to->to_tsval = htonl(to->to_tsval);
 			to->to_tsecr = htonl(to->to_tsecr);
 			bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
 			optp += sizeof(to->to_tsval);
 			bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
 			optp += sizeof(to->to_tsecr);
 			break;
 		case TOF_SIGNATURE:
 			{
 			int siglen = TCPOLEN_SIGNATURE - 2;
 
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			/* Skip the option entirely if it no longer fits. */
 			if (MAX_TCPOPTLEN - optlen < TCPOLEN_SIGNATURE)
 				continue;
 			optlen += TCPOLEN_SIGNATURE;
 			*optp++ = TCPOPT_SIGNATURE;
 			*optp++ = TCPOLEN_SIGNATURE;
 			/* Remember where the signature goes; zero it for now. */
 			to->to_signature = optp;
 			while (siglen--)
 				 *optp++ = 0;
 			break;
 			}
 		case TOF_SACK:
 			{
 			int sackblks = 0;
 			struct sackblk *sack = (struct sackblk *)to->to_sacks;
 			tcp_seq sack_seq;
 
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			/* Need room for the header plus at least one block. */
 			if (MAX_TCPOPTLEN - optlen < 2 + TCPOLEN_SACK)
 				continue;
 			optlen += TCPOLEN_SACKHDR;
 			*optp++ = TCPOPT_SACK;
 			/* Emit as many blocks as requested and as still fit. */
 			sackblks = min(to->to_nsacks,
 					(MAX_TCPOPTLEN - optlen) / TCPOLEN_SACK);
 			*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
 			while (sackblks--) {
 				sack_seq = htonl(sack->start);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				sack_seq = htonl(sack->end);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				optlen += TCPOLEN_SACK;
 				sack++;
 			}
 			tcpstat.tcps_sack_send_blocks++;
 			break;
 			}
 		default:
 			panic("%s: unknown TCP option type", __func__);
 			break;
 		}
 	}
 
 	/*
 	 * Terminate and pad TCP options to a 4 byte boundary.
 	 *
 	 * RFC 793 requires everything after the End-of-Option-List option
 	 * to be header padding composed of zeros, so NOP (kind 1) must not
 	 * be used here.  The End-of-Option-List kind is itself zero in
 	 * RFC 793, so the same byte serves as terminator and as padding.
 	 */
 	while (optlen % 4) {
 		optlen += TCPOLEN_EOL;
 		*optp++ = TCPOPT_EOL;
 	}
 
 	KASSERT(optlen <= MAX_TCPOPTLEN, ("%s: TCP options too long", __func__));
 	return (optlen);
 }
Index: head/sys/sys/socketvar.h
===================================================================
--- head/sys/sys/socketvar.h	(revision 167714)
+++ head/sys/sys/socketvar.h	(revision 167715)
@@ -1,584 +1,594 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)socketvar.h	8.3 (Berkeley) 2/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_SOCKETVAR_H_
 #define _SYS_SOCKETVAR_H_
 
 #include <sys/queue.h>			/* for TAILQ macros */
 #include <sys/selinfo.h>		/* for struct selinfo */
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
 /*
  * Kernel structure per socket.
  * Contains send and receive buffer queues,
  * handle on protocol and pointer to protocol
  * private data and error information.
  */
 typedef	u_quad_t so_gen_t;
 
 /*-
  * Locking key to struct socket:
  * (a) constant after allocation, no locking required.
  * (b) locked by SOCK_LOCK(so).
  * (c) locked by SOCKBUF_LOCK(&so->so_rcv).
  * (d) locked by SOCKBUF_LOCK(&so->so_snd).
  * (e) locked by ACCEPT_LOCK().
  * (f) not locked since integer reads/writes are atomic.
  * (g) used only as a sleep/wakeup address, no value.
  * (h) locked by global mutex so_global_mtx.
  *
  * Fields whose comment carries no key letter have no locking rule
  * recorded here.
  */
 struct socket {
 	int	so_count;		/* (b) reference count */
 	short	so_type;		/* (a) generic type, see socket.h */
 	short	so_options;		/* from socket call, see socket.h */
 	short	so_linger;		/* time to linger while closing */
 	short	so_state;		/* (b) internal state flags SS_* */
 	int	so_qstate;		/* (e) internal state flags SQ_* */
 	void	*so_pcb;		/* protocol control block */
 	struct	protosw *so_proto;	/* (a) protocol handle */
 /*
  * Variables for connection queuing.
  * Socket where accepts occur is so_head in all subsidiary sockets.
  * If so_head is 0, socket is not related to an accept.
  * For head socket so_incomp queues partially completed connections,
  * while so_comp is a queue of connections ready to be accepted.
  * If a connection is aborted and it has so_head set, then
  * it has to be pulled out of either so_incomp or so_comp.
  * We allow connections to queue up based on current queue lengths
  * and limit on number of queued connections for this socket.
  */
 	struct	socket *so_head;	/* (e) back pointer to listen socket */
 	TAILQ_HEAD(, socket) so_incomp;	/* (e) queue of partial unaccepted connections */
 	TAILQ_HEAD(, socket) so_comp;	/* (e) queue of complete unaccepted connections */
 	TAILQ_ENTRY(socket) so_list;	/* (e) list of unaccepted connections */
 	u_short	so_qlen;		/* (e) number of unaccepted connections */
 	u_short	so_incqlen;		/* (e) number of unaccepted incomplete
 					   connections */
 	u_short	so_qlimit;		/* (e) max number queued connections */
 	short	so_timeo;		/* (g) connection timeout */
 	u_short	so_error;		/* (f) error affecting connection */
 	struct	sigio *so_sigio;	/* [sg] information for async I/O or
 					   out of band data (SIGURG) */
 	u_long	so_oobmark;		/* (c) chars to oob mark */
 	TAILQ_HEAD(, aiocblist) so_aiojobq; /* AIO ops waiting on socket */
 /*
  * Variables for socket buffering.
  * One struct sockbuf is embedded for each direction: so_rcv and so_snd.
  */
 	struct sockbuf {
 		struct	selinfo sb_sel;	/* process selecting read/write */
 		struct	mtx sb_mtx;	/* sockbuf lock */
 		short	sb_state;	/* (c/d) socket state on sockbuf */
 /*
  * NOTE(review): sb_startzero presumably names the first field cleared
  * when a sockbuf is reset -- confirm against its users.
  */
 #define	sb_startzero	sb_mb
 		struct	mbuf *sb_mb;	/* (c/d) the mbuf chain */
 		struct	mbuf *sb_mbtail; /* (c/d) the last mbuf in the chain */
 		struct	mbuf *sb_lastrecord;	/* (c/d) first mbuf of last
 						 * record in socket buffer */
+		struct	mbuf *sb_sndptr; /* (c/d) pointer into mbuf chain */
+		u_int	sb_sndptroff;	/* (c/d) byte offset of ptr into chain */
 		u_int	sb_cc;		/* (c/d) actual chars in buffer */
 		u_int	sb_hiwat;	/* (c/d) max actual char count */
 		u_int	sb_mbcnt;	/* (c/d) chars of mbufs used */
 		u_int	sb_mbmax;	/* (c/d) max chars of mbufs to use */
 		u_int	sb_ctl;		/* (c/d) non-data chars in buffer */
 		int	sb_lowat;	/* (c/d) low water mark */
 		int	sb_timeo;	/* (c/d) timeout for read/write */
 		short	sb_flags;	/* (c/d) flags, see below */
 	} so_rcv, so_snd;
 /*
  * Size limit for a socket buffer (this is not an sb_flags bit).
  */
 #define	SB_MAX		(256*1024)	/* default for max chars in sockbuf */
 /*
  * Constants for sb_flags field of struct sockbuf.
  */
 #define	SB_LOCK		0x01		/* lock on data queue */
 #define	SB_WANT		0x02		/* someone is waiting to lock */
 #define	SB_WAIT		0x04		/* someone is waiting for data/space */
 #define	SB_SEL		0x08		/* someone is selecting */
 #define	SB_ASYNC	0x10		/* ASYNC I/O, need signals */
 #define	SB_UPCALL	0x20		/* someone wants an upcall */
 #define	SB_NOINTR	0x40		/* operations not interruptible */
 #define SB_AIO		0x80		/* AIO operations queued */
 #define SB_KNOTE	0x100		/* kernel note attached */
 #define	SB_AUTOSIZE	0x800		/* automatically size socket buffer */
 
 	/* Upcall hook and its opaque argument (see SB_UPCALL above). */
 	void	(*so_upcall)(struct socket *, void *, int);
 	void	*so_upcallarg;
 	struct	ucred *so_cred;		/* (a) user credentials */
 	struct	label *so_label;	/* (b) MAC label for socket */
 	struct	label *so_peerlabel;	/* (b) cached MAC label for peer */
 	/* NB: generation count must not be first. */
 	so_gen_t so_gencnt;		/* (h) generation count */
 	void	*so_emuldata;		/* (b) private data for emulators */
 	/* Accept filter state, referenced through the so_accf pointer. */
  	struct so_accf {
 		struct	accept_filter *so_accept_filter;
 		void	*so_accept_filter_arg;	/* saved filter args */
 		char	*so_accept_filter_str;	/* saved user args */
 	} *so_accf;
 };
 
 /*
  * If the sockbuf's mbuf chain is empty (sb_mb == NULL), also clear the
  * sb_mbtail and sb_lastrecord pointers so that all three chain pointers
  * agree.  Does nothing when the chain is non-empty.
  */
 #define SB_EMPTY_FIXUP(sb) do {						\
 	if ((sb)->sb_mb == NULL) {					\
 		(sb)->sb_mbtail = NULL;					\
 		(sb)->sb_lastrecord = NULL;				\
 	}								\
 } while (/*CONSTCOND*/0)
 
 /*
  * Global accept mutex to serialize access to accept queues and
  * fields associated with multiple sockets.  This allows us to
  * avoid defining a lock order between listen and accept sockets
  * until such time as it proves to be a good idea.
  */
 extern struct mtx accept_mtx;
 #define	ACCEPT_LOCK_ASSERT()		mtx_assert(&accept_mtx, MA_OWNED)
 #define	ACCEPT_UNLOCK_ASSERT()		mtx_assert(&accept_mtx, MA_NOTOWNED)
 #define	ACCEPT_LOCK()			mtx_lock(&accept_mtx)
 #define	ACCEPT_UNLOCK()			mtx_unlock(&accept_mtx)
 
 /*
  * Per-socket buffer mutex used to protect most fields in the socket
  * buffer.
  */
 #define	SOCKBUF_MTX(_sb)		(&(_sb)->sb_mtx)
 #define	SOCKBUF_LOCK_INIT(_sb, _name) \
 	mtx_init(SOCKBUF_MTX(_sb), _name, NULL, MTX_DEF)
 #define	SOCKBUF_LOCK_DESTROY(_sb)	mtx_destroy(SOCKBUF_MTX(_sb))
 #define	SOCKBUF_LOCK(_sb)		mtx_lock(SOCKBUF_MTX(_sb))
 #define	SOCKBUF_OWNED(_sb)		mtx_owned(SOCKBUF_MTX(_sb))
 #define	SOCKBUF_UNLOCK(_sb)		mtx_unlock(SOCKBUF_MTX(_sb))
 #define	SOCKBUF_LOCK_ASSERT(_sb)	mtx_assert(SOCKBUF_MTX(_sb), MA_OWNED)
 #define	SOCKBUF_UNLOCK_ASSERT(_sb)	mtx_assert(SOCKBUF_MTX(_sb), MA_NOTOWNED)
 
 /*
  * Per-socket mutex: we reuse the receive socket buffer mutex for space
  * efficiency.  This decision should probably be revisited as we optimize
  * locking for the socket code.
  */
 #define	SOCK_MTX(_so)			SOCKBUF_MTX(&(_so)->so_rcv)
 #define	SOCK_LOCK(_so)			SOCKBUF_LOCK(&(_so)->so_rcv)
 #define	SOCK_OWNED(_so)			SOCKBUF_OWNED(&(_so)->so_rcv)
 #define	SOCK_UNLOCK(_so)		SOCKBUF_UNLOCK(&(_so)->so_rcv)
 #define	SOCK_LOCK_ASSERT(_so)		SOCKBUF_LOCK_ASSERT(&(_so)->so_rcv)
 
 /*
  * Socket state bits.
  *
  * Historically, these bits were all kept in the so_state field.  For
  * locking reasons, they are now in multiple fields, as they are
  * locked differently.  so_state maintains basic socket state protected
  * by the socket lock.  so_qstate holds information about the socket
  * accept queues.  Each socket buffer also has a state field holding
  * information relevant to that socket buffer (can't send, rcv).  Many
  * fields will be read without locks to improve performance and avoid
  * lock order issues.  However, this approach must be used with caution.
  */
 #define	SS_NOFDREF		0x0001	/* no file table ref any more */
 #define	SS_ISCONNECTED		0x0002	/* socket connected to a peer */
 #define	SS_ISCONNECTING		0x0004	/* in process of connecting to peer */
 #define	SS_ISDISCONNECTING	0x0008	/* in process of disconnecting */
 #define	SS_NBIO			0x0100	/* non-blocking ops */
 #define	SS_ASYNC		0x0200	/* async i/o notify */
 #define	SS_ISCONFIRMING		0x0400	/* deciding to accept connection req */
 #define	SS_ISDISCONNECTED	0x2000	/* socket disconnected from peer */
 /*
  * Protocols can mark a socket as SS_PROTOREF to indicate that, following
  * pru_detach, they still want the socket to persist, and will free it
  * themselves when they are done.  Protocols should only ever call sofree()
  * following setting this flag in pru_detach(), and never otherwise, as
  * sofree() bypasses socket reference counting.
  */
 #define	SS_PROTOREF		0x4000	/* strong protocol reference */
 
 /*
  * Socket state bits now stored in the socket buffer state field.
  */
 #define	SBS_CANTSENDMORE	0x0010	/* can't send more data to peer */
 #define	SBS_CANTRCVMORE		0x0020	/* can't receive more data from peer */
 #define	SBS_RCVATMARK		0x0040	/* at mark on input */
 
 /*
  * Socket state bits stored in so_qstate.
  */
 #define	SQ_INCOMP		0x0800	/* unaccepted, incomplete connection */
 #define	SQ_COMP			0x1000	/* unaccepted, complete connection */
 
 /*
  * Externalized form of struct socket used by the sysctl(3) interface.
  *
  * The embedded struct xsockbuf carries only the counters, water marks,
  * timeout and flags of a struct sockbuf; the mbuf pointers, selinfo and
  * mutex of the kernel structure have no counterpart here.
  */
 struct xsocket {
 	size_t	xso_len;	/* length of this structure */
 	struct	socket *xso_so;	/* makes a convenient handle sometimes */
 	short	so_type;
 	short	so_options;
 	short	so_linger;
 	short	so_state;
 	caddr_t	so_pcb;		/* another convenient handle */
 	int	xso_protocol;
 	int	xso_family;
 	u_short	so_qlen;
 	u_short	so_incqlen;
 	u_short	so_qlimit;
 	short	so_timeo;
 	u_short	so_error;
 	pid_t	so_pgid;
 	u_long	so_oobmark;
 	struct xsockbuf {
 		u_int	sb_cc;
 		u_int	sb_hiwat;
 		u_int	sb_mbcnt;
 		u_int	sb_mbmax;
 		int	sb_lowat;
 		int	sb_timeo;
 		short	sb_flags;
 	} so_rcv, so_snd;	/* receive and send buffer summaries */
 	uid_t	so_uid;		/* XXX */
 };
 
 #ifdef _KERNEL
 
 /*
  * Macros for sockets and socket buffering.
  */
 
 /*
  * Do we need to notify the other side when I/O is possible?
  * True when any of SB_WAIT, SB_SEL, SB_ASYNC, SB_UPCALL, SB_AIO or
  * SB_KNOTE is set in sb_flags.
  */
 #define	sb_notify(sb)	(((sb)->sb_flags & (SB_WAIT | SB_SEL | SB_ASYNC | \
     SB_UPCALL | SB_AIO | SB_KNOTE)) != 0)
 
 /*
  * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
  * This is problematical if the fields are unsigned, as the space might
  * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
  * overflow and return 0.  Should use "lmin" but it doesn't exist now.
  *
  * The result is the smaller of the remaining byte budget
  * (sb_hiwat - sb_cc) and the remaining mbuf budget (sb_mbmax - sb_mbcnt),
  * each converted to int before the comparison and widened to long.
  */
 #define	sbspace(sb) \
     ((long) imin((int)((sb)->sb_hiwat - (sb)->sb_cc), \
 	 (int)((sb)->sb_mbmax - (sb)->sb_mbcnt)))
 
 /*
  * Do we have to send all at once on a socket?
  * Non-zero when the protocol has PR_ATOMIC set in pr_flags.
  */
 #define	sosendallatonce(so) \
     ((so)->so_proto->pr_flags & PR_ATOMIC)
 
 /*
  * Can we read something from so?
  * True when the receive buffer holds at least sb_lowat bytes, or the
  * receive side is marked SBS_CANTRCVMORE, or a completed connection is
  * queued on so_comp, or so_error is set.
  */
 #define	soreadable(so) \
     ((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \
 	((so)->so_rcv.sb_state & SBS_CANTRCVMORE) || \
 	!TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error)
 
 /*
  * Can we write something to so?
  * True when the send buffer has at least sb_lowat bytes of space and the
  * socket is either connected or its protocol does not require a
  * connection; also true when the send side is marked SBS_CANTSENDMORE or
  * so_error is set.
  */
 #define	sowriteable(so) \
     ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && \
 	(((so)->so_state&SS_ISCONNECTED) || \
 	  ((so)->so_proto->pr_flags&PR_CONNREQUIRED)==0)) || \
      ((so)->so_snd.sb_state & SBS_CANTSENDMORE) || \
      (so)->so_error)
 
 /*
  * Adjust counters in sb reflecting allocation of m.
  *
  * Adds m's length to sb_cc (and to sb_ctl when m is neither MT_DATA nor
  * MT_OOBDATA), and adds MSIZE plus the size of any external storage
  * (M_EXT) to sb_mbcnt.
  *
  * Wrapped in do { } while (0) so the expansion is a single statement and
  * can be used as the unbraced body of an if/else.  Both arguments are
  * evaluated more than once, so they must be free of side effects.
  */
 #define	sballoc(sb, m) do { \
 	(sb)->sb_cc += (m)->m_len; \
 	if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \
 		(sb)->sb_ctl += (m)->m_len; \
 	(sb)->sb_mbcnt += MSIZE; \
 	if ((m)->m_flags & M_EXT) \
 		(sb)->sb_mbcnt += (m)->m_ext.ext_size; \
 } while (0)
 
 /*
  * Adjust counters in sb reflecting freeing of m.
  *
  * Subtracts m's length from sb_cc (and from sb_ctl when m is neither
  * MT_DATA nor MT_OOBDATA), and subtracts MSIZE plus the size of any
  * external storage (M_EXT) from sb_mbcnt.
  *
  * Also maintains the send pointer: if sb_sndptr refers to m, the pointer
  * is cleared and sb_sndptroff reset to 0; after that, a non-zero
  * sb_sndptroff is reduced by m's length.
  *
  * NOTE(review): this expands to a bare brace block rather than
  * do { } while (0), so it cannot be the unbraced body of an if that has
  * an else.  Both arguments are evaluated more than once.
  */
 #define	sbfree(sb, m) { \
 	(sb)->sb_cc -= (m)->m_len; \
 	if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \
 		(sb)->sb_ctl -= (m)->m_len; \
 	(sb)->sb_mbcnt -= MSIZE; \
 	if ((m)->m_flags & M_EXT) \
 		(sb)->sb_mbcnt -= (m)->m_ext.ext_size; \
+	if ((sb)->sb_sndptr == (m)) { \
+		(sb)->sb_sndptr = NULL; \
+		(sb)->sb_sndptroff = 0; \
+	} \
+	if ((sb)->sb_sndptroff != 0) \
+		(sb)->sb_sndptroff -= (m)->m_len; \
 }
 
 /*
  * Set lock on sockbuf sb; sleep if lock is already held.
  * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
  * Returns error without lock if sleep is interrupted.
  *
  * Value of the expression:
  *   - SB_LOCK clear: the flag is set and the result is 0.
  *   - SB_LOCK set and wf == M_WAITOK: the result of sb_lock(sb).
  *   - SB_LOCK set otherwise: EWOULDBLOCK, and the lock is not taken.
  *
  * The "set the flag, yield 0" arm is parenthesized as a unit.  The comma
  * operator binds more loosely than ?:, so without those parentheses the
  * trailing ", 0" would apply to the whole conditional and the macro
  * would evaluate to 0 on every path, hiding both failure cases.
  */
 #define sblock(sb, wf) ((sb)->sb_flags & SB_LOCK ? \
 		(((wf) == M_WAITOK) ? sb_lock(sb) : EWOULDBLOCK) : \
 		(((sb)->sb_flags |= SB_LOCK), 0))
 
 /*
  * Release lock on sockbuf sb.
  *
  * Asserts that the sockbuf mutex is held, clears SB_LOCK, and, if
  * SB_WANT is set, clears it and issues a wakeup on &sb->sb_flags.
  */
 #define	sbunlock(sb) do { \
 	SOCKBUF_LOCK_ASSERT(sb); \
 	(sb)->sb_flags &= ~SB_LOCK; \
 	if ((sb)->sb_flags & SB_WANT) { \
 		(sb)->sb_flags &= ~SB_WANT; \
 		wakeup(&(sb)->sb_flags); \
 	} \
 } while (0)
 
 /*
  * soref()/sorele() ref-count the socket structure.  Note that you must
  * still explicitly close the socket, but the last ref count will free
  * the structure.
  *
  * soref() asserts that the socket lock is held and increments so_count.
  */
 #define	soref(so) do {							\
 	SOCK_LOCK_ASSERT(so);						\
 	++(so)->so_count;						\
 } while (0)
 
 /*
  * Drop one reference.  Asserts that both the accept lock and the socket
  * lock are held, and panics if so_count is not positive.  When the count
  * reaches zero sofree() is called with both locks still held; otherwise
  * the socket lock and then the accept lock are released here.
  */
 #define	sorele(so) do {							\
 	ACCEPT_LOCK_ASSERT();						\
 	SOCK_LOCK_ASSERT(so);						\
 	if ((so)->so_count <= 0)					\
 		panic("sorele");					\
 	if (--(so)->so_count == 0)					\
 		sofree(so);						\
 	else {								\
 		SOCK_UNLOCK(so);					\
 		ACCEPT_UNLOCK();					\
 	}								\
 } while (0)
 
 /*
  * Free the socket if it has no references, without changing the count.
  * Asserts the same two locks as sorele(); calls sofree() when so_count
  * is zero, otherwise releases the socket lock and the accept lock.
  */
 #define	sotryfree(so) do {						\
 	ACCEPT_LOCK_ASSERT();						\
 	SOCK_LOCK_ASSERT(so);						\
 	if ((so)->so_count == 0)					\
 		sofree(so);						\
 	else {								\
 		SOCK_UNLOCK(so);					\
 		ACCEPT_UNLOCK();					\
 	}								\
 } while(0)
 
 /*
  * In sorwakeup() and sowwakeup(), acquire the socket buffer lock to
  * avoid a non-atomic test-and-wakeup.  However, sowakeup is
  * responsible for releasing the lock if it is called.  We unlock only
  * if we don't call into sowakeup.  If any code is introduced that
  * directly invokes the underlying sowakeup() primitives, it must
  * maintain the same semantics.
  *
  * The *_locked variants assert that the caller already holds the
  * corresponding socket buffer lock; in every case the lock is no longer
  * held by the time the macro completes.
  */
 /* Receive side, lock already held by the caller. */
 #define	sorwakeup_locked(so) do {					\
 	SOCKBUF_LOCK_ASSERT(&(so)->so_rcv);				\
 	if (sb_notify(&(so)->so_rcv))					\
 		sowakeup((so), &(so)->so_rcv);	 			\
 	else								\
 		SOCKBUF_UNLOCK(&(so)->so_rcv);				\
 } while (0)
 
 /* Receive side: take the so_rcv lock, then proceed as above. */
 #define	sorwakeup(so) do {						\
 	SOCKBUF_LOCK(&(so)->so_rcv);					\
 	sorwakeup_locked(so);						\
 } while (0)
 
 /* Send side, lock already held by the caller. */
 #define	sowwakeup_locked(so) do {					\
 	SOCKBUF_LOCK_ASSERT(&(so)->so_snd);				\
 	if (sb_notify(&(so)->so_snd))					\
 		sowakeup((so), &(so)->so_snd); 				\
 	else								\
 		SOCKBUF_UNLOCK(&(so)->so_snd);				\
 } while (0)
 
 /* Send side: take the so_snd lock, then proceed as above. */
 #define	sowwakeup(so) do {						\
 	SOCKBUF_LOCK(&(so)->so_snd);					\
 	sowwakeup_locked(so);						\
 } while (0)
 
 /*
  * Argument structure for sosetopt et seq.  This is in the KERNEL
  * section because it will never be visible to user code.
  */
 enum sopt_dir { SOPT_GET, SOPT_SET };
 struct sockopt {
 	enum	sopt_dir sopt_dir; /* is this a get or a set? */
 	int	sopt_level;	/* second arg of [gs]etsockopt */
 	int	sopt_name;	/* third arg of [gs]etsockopt */
 	void   *sopt_val;	/* fourth arg of [gs]etsockopt */
 	size_t	sopt_valsize;	/* (almost) fifth arg of [gs]etsockopt */
 	struct	thread *sopt_td; /* calling thread or null if kernel */
 };
 
 /*
  * Descriptor for one accept filter: a name, three hooks, and a link for
  * a singly-linked list of descriptors.
  * NOTE(review): when each hook is invoked is not visible in this header;
  * the per-field notes below are inferred from the member names -- confirm
  * against the accept filter implementation.
  */
 struct accept_filter {
 	char	accf_name[16];		/* filter name, at most 16 bytes */
 	void	(*accf_callback)	/* presumably the socket upcall */
 		(struct socket *so, void *arg, int waitflag);
 	void *	(*accf_create)		/* presumably builds per-socket state */
 		(struct socket *so, char *arg);
 	void	(*accf_destroy)		/* presumably tears that state down */
 		(struct socket *so);
 	SLIST_ENTRY(accept_filter) accf_next;	/* list linkage */
 };
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_ACCF);
 MALLOC_DECLARE(M_PCB);
 MALLOC_DECLARE(M_SONAME);
 #endif
 
 extern int	maxsockets;
 extern u_long	sb_max;
 extern struct uma_zone *socket_zone;
 extern so_gen_t so_gencnt;
 
 struct mbuf;
 struct sockaddr;
 struct ucred;
 struct uio;
 
 /*
  * From uipc_socket and friends
  */
 int	do_getopt_accept_filter(struct socket *so, struct sockopt *sopt);
 int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
 int	so_setsockopt(struct socket *so, int level, int optname,
 	    void *optval, size_t optlen);
 int	sockargs(struct mbuf **mp, caddr_t buf, int buflen, int type);
 int	getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len);
 void	sbappend(struct sockbuf *sb, struct mbuf *m);
 void	sbappend_locked(struct sockbuf *sb, struct mbuf *m);
 void	sbappendstream(struct sockbuf *sb, struct mbuf *m);
 void	sbappendstream_locked(struct sockbuf *sb, struct mbuf *m);
 int	sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
 	    struct mbuf *m0, struct mbuf *control);
 int	sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
 	    struct mbuf *m0, struct mbuf *control);
 int	sbappendcontrol(struct sockbuf *sb, struct mbuf *m0,
 	    struct mbuf *control);
 int	sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
 	    struct mbuf *control);
 void	sbappendrecord(struct sockbuf *sb, struct mbuf *m0);
 void	sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0);
 void	sbcheck(struct sockbuf *sb);
 void	sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n);
 struct mbuf *
 	sbcreatecontrol(caddr_t p, int size, int type, int level);
 void	sbdestroy(struct sockbuf *sb, struct socket *so);
 void	sbdrop(struct sockbuf *sb, int len);
 void	sbdrop_locked(struct sockbuf *sb, int len);
 void	sbdroprecord(struct sockbuf *sb);
 void	sbdroprecord_locked(struct sockbuf *sb);
 void	sbflush(struct sockbuf *sb);
 void	sbflush_locked(struct sockbuf *sb);
 void	sbrelease(struct sockbuf *sb, struct socket *so);
 void	sbrelease_locked(struct sockbuf *sb, struct socket *so);
 int	sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
 	    struct thread *td);
 int	sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
 	    struct thread *td);
+struct mbuf *
+	sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff);
 void	sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb);
 int	sbwait(struct sockbuf *sb);
 int	sb_lock(struct sockbuf *sb);
 void	soabort(struct socket *so);
 int	soaccept(struct socket *so, struct sockaddr **nam);
 int	socheckuid(struct socket *so, uid_t uid);
 int	sobind(struct socket *so, struct sockaddr *nam, struct thread *td);
 void	socantrcvmore(struct socket *so);
 void	socantrcvmore_locked(struct socket *so);
 void	socantsendmore(struct socket *so);
 void	socantsendmore_locked(struct socket *so);
 int	soclose(struct socket *so);
 int	soconnect(struct socket *so, struct sockaddr *nam, struct thread *td);
 int	soconnect2(struct socket *so1, struct socket *so2);
 int	socow_setup(struct mbuf *m0, struct uio *uio);
 int	socreate(int dom, struct socket **aso, int type, int proto,
 	    struct ucred *cred, struct thread *td);
 int	sodisconnect(struct socket *so);
 struct	sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags);
 void	sofree(struct socket *so);
 int	sogetopt(struct socket *so, struct sockopt *sopt);
 void	sohasoutofband(struct socket *so);
 void	soisconnected(struct socket *so);
 void	soisconnecting(struct socket *so);
 void	soisdisconnected(struct socket *so);
 void	soisdisconnecting(struct socket *so);
 int	solisten(struct socket *so, int backlog, struct thread *td);
 void	solisten_proto(struct socket *so, int backlog);
 int	solisten_proto_check(struct socket *so);
 struct socket *
 	sonewconn(struct socket *head, int connstatus);
 int	sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen);
 int	sooptcopyout(struct sockopt *sopt, const void *buf, size_t len);
 
 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
 int	soopt_getm(struct sockopt *sopt, struct mbuf **mp);
 int	soopt_mcopyin(struct sockopt *sopt, struct mbuf *m);
 int	soopt_mcopyout(struct sockopt *sopt, struct mbuf *m);
 
 int	sopoll(struct socket *so, int events, struct ucred *active_cred,
 	    struct thread *td);
 int	sopoll_generic(struct socket *so, int events,
 	    struct ucred *active_cred, struct thread *td);
 int	soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio,
 	    struct mbuf **mp0, struct mbuf **controlp, int *flagsp);
 int	soreceive_generic(struct socket *so, struct sockaddr **paddr,
 	    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
 	    int *flagsp);
 int	soreserve(struct socket *so, u_long sndcc, u_long rcvcc);
 void	sorflush(struct socket *so);
 int	sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
 	    struct mbuf *top, struct mbuf *control, int flags,
 	    struct thread *td);
 int	sosend_dgram(struct socket *so, struct sockaddr *addr,
 	    struct uio *uio, struct mbuf *top, struct mbuf *control,
 	    int flags, struct thread *td);
 int	sosend_generic(struct socket *so, struct sockaddr *addr,
 	    struct uio *uio, struct mbuf *top, struct mbuf *control,
 	    int flags, struct thread *td);
 int	sosetopt(struct socket *so, struct sockopt *sopt);
 int	soshutdown(struct socket *so, int how);
 void	sotoxsocket(struct socket *so, struct xsocket *xso);
 void	sowakeup(struct socket *so, struct sockbuf *sb);
 
 #ifdef SOCKBUF_DEBUG
 void	sblastrecordchk(struct sockbuf *, const char *, int);
 #define	SBLASTRECORDCHK(sb)	sblastrecordchk((sb), __FILE__, __LINE__)
 
 void	sblastmbufchk(struct sockbuf *, const char *, int);
 #define	SBLASTMBUFCHK(sb)	sblastmbufchk((sb), __FILE__, __LINE__)
 #else
 #define	SBLASTRECORDCHK(sb)      /* nothing */
 #define	SBLASTMBUFCHK(sb)        /* nothing */
 #endif /* SOCKBUF_DEBUG */
 
 /*
  * Accept filter functions (duh).
  */
 int	accept_filt_add(struct accept_filter *filt);
 int	accept_filt_del(char *name);
 struct	accept_filter *accept_filt_get(char *name);
 #ifdef ACCEPT_FILTER_MOD
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_net_inet_accf);
 #endif
 int	accept_filt_generic_mod_event(module_t mod, int event, void *data);
 #endif
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_SOCKETVAR_H_ */