Index: head/sys/kern/uipc_sockbuf.c =================================================================== --- head/sys/kern/uipc_sockbuf.c (revision 334803) +++ head/sys/kern/uipc_sockbuf.c (revision 334804) @@ -1,1416 +1,1465 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include #include /* for aio_swake proto */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Function pointer set by the AIO routines so that the socket buffer code * can call back into the AIO module if it is loaded. */ void (*aio_swake)(struct socket *, struct sockbuf *); /* * Primitive routines for operating on socket buffers */ u_long sb_max = SB_MAX; u_long sb_max_adj = (quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */ static u_long sb_efficiency = 8; /* parameter for sbreserve() */ static struct mbuf *sbcut_internal(struct sockbuf *sb, int len); static void sbflush_internal(struct sockbuf *sb); /* * Our own version of m_clrprotoflags(), that can preserve M_NOTREADY. */ static void sbm_clrprotoflags(struct mbuf *m, int flags) { int mask; mask = ~M_PROTOFLAGS; if (flags & PRUS_NOTREADY) mask |= M_NOTREADY; while (m) { m->m_flags &= mask; m = m->m_next; } } /* * Mark ready "count" mbufs starting with "m". */ int sbready(struct sockbuf *sb, struct mbuf *m, int count) { u_int blocker; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb)); blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0; for (int i = 0; i < count; i++, m = m->m_next) { KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); m->m_flags &= ~(M_NOTREADY | blocker); if (blocker) sb->sb_acc += m->m_len; } if (!blocker) return (EINPROGRESS); /* This one was blocking all the queue. 
*/ for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) { KASSERT(m->m_flags & M_BLOCKED, ("%s: m %p !M_BLOCKED", __func__, m)); m->m_flags &= ~M_BLOCKED; sb->sb_acc += m->m_len; } sb->sb_fnrdy = m; return (0); } /* * Adjust sockbuf state reflecting allocation of m. */ void sballoc(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) { if (m->m_flags & M_NOTREADY) sb->sb_fnrdy = m; else sb->sb_acc += m->m_len; } else m->m_flags |= M_BLOCKED; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl += m->m_len; sb->sb_mbcnt += MSIZE; sb->sb_mcnt += 1; if (m->m_flags & M_EXT) { sb->sb_mbcnt += m->m_ext.ext_size; sb->sb_ccnt += 1; } } /* * Adjust sockbuf state reflecting freeing of m. */ void sbfree(struct sockbuf *sb, struct mbuf *m) { #if 0 /* XXX: not yet: soclose() call path comes here w/o lock. */ SOCKBUF_LOCK_ASSERT(sb); #endif sb->sb_ccc -= m->m_len; if (!(m->m_flags & M_NOTAVAIL)) sb->sb_acc -= m->m_len; if (m == sb->sb_fnrdy) { struct mbuf *n; KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); n = m->m_next; while (n != NULL && !(n->m_flags & M_NOTREADY)) { n->m_flags &= ~M_BLOCKED; sb->sb_acc += n->m_len; n = n->m_next; } sb->sb_fnrdy = n; } if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl -= m->m_len; sb->sb_mbcnt -= MSIZE; sb->sb_mcnt -= 1; if (m->m_flags & M_EXT) { sb->sb_mbcnt -= m->m_ext.ext_size; sb->sb_ccnt -= 1; } if (sb->sb_sndptr == m) { sb->sb_sndptr = NULL; sb->sb_sndptroff = 0; } if (sb->sb_sndptroff != 0) sb->sb_sndptroff -= m->m_len; } /* * Socantsendmore indicates that no more data will be sent on the socket; it * would normally be applied to a socket when the user informs the system * that no more data is to be sent, by the protocol code (in case * PRU_SHUTDOWN). Socantrcvmore indicates that no more data will be * received, and will normally be applied to the socket by a protocol when it * detects that the peer will send no more data. Data queued for reading in * the socket may yet be read. */ void socantsendmore_locked(struct socket *so) { SOCKBUF_LOCK_ASSERT(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; sowwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantsendmore(struct socket *so) { SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantrcvmore_locked(struct socket *so) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); so->so_rcv.sb_state |= SBS_CANTRCVMORE; sorwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } void socantrcvmore(struct socket *so) { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } /* * Wait for data to arrive at/drain from a socket buffer. */ int sbwait(struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_flags |= SB_WAIT; return (msleep_sbt(&sb->sb_acc, &sb->sb_mtx, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", sb->sb_timeo, 0, 0)); } int sblock(struct sockbuf *sb, int flags) { KASSERT((flags & SBL_VALID) == flags, ("sblock: flags invalid (0x%x)", flags)); if (flags & SBL_WAIT) { if ((sb->sb_flags & SB_NOINTR) || (flags & SBL_NOINTR)) { sx_xlock(&sb->sb_sx); return (0); } return (sx_xlock_sig(&sb->sb_sx)); } else { if (sx_try_xlock(&sb->sb_sx) == 0) return (EWOULDBLOCK); return (0); } } void sbunlock(struct sockbuf *sb) { sx_xunlock(&sb->sb_sx); } /* * Wakeup processes waiting on a socket buffer. 
Do asynchronous notification * via SIGIO if the socket has the SS_ASYNC flag set. * * Called with the socket buffer lock held; will release the lock by the end * of the function. This allows the caller to acquire the socket buffer lock * while testing for the need for various sorts of wakeup and hold it through * to the point where it's no longer required. We currently hold the lock * through calls out to other subsystems (with the exception of kqueue), and * then release it to avoid lock order issues. It's not clear that's * correct. */ void sowakeup(struct socket *so, struct sockbuf *sb) { int ret; SOCKBUF_LOCK_ASSERT(sb); selwakeuppri(sb->sb_sel, PSOCK); if (!SEL_WAITING(sb->sb_sel)) sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_acc); } KNOTE_LOCKED(&sb->sb_sel->si_note, 0); if (sb->sb_upcall != NULL) { ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT); if (ret == SU_ISCONNECTED) { KASSERT(sb == &so->so_rcv, ("SO_SND upcall returned SU_ISCONNECTED")); soupcall_clear(so, SO_RCV); } } else ret = SU_OK; if (sb->sb_flags & SB_AIO) sowakeup_aio(so, sb); SOCKBUF_UNLOCK(sb); if (ret == SU_ISCONNECTED) soisconnected(so); if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED); } /* * Socket buffer (struct sockbuf) utility routines. * * Each socket contains two socket buffers: one for sending data and one for * receiving data. Each buffer contains a queue of mbufs, information about * the number of mbufs and amount of data in the queue, and other fields * allowing select() statements and notification on data availability to be * implemented. * * Data stored in a socket buffer is maintained as a list of records. Each * record is a list of mbufs chained together with the m_next field. Records * are chained together with the m_nextpkt field. The upper level routine * soreceive() expects the following conventions to be observed when placing * information in the receive buffer: * * 1. If the protocol requires each message be preceded by the sender's name, * then a record containing that name must be present before any * associated data (mbuf's must be of type MT_SONAME). * 2. If the protocol supports the exchange of ``access rights'' (really just * additional data associated with the message), and there are ``rights'' * to be received, then a record containing this data should be present * (mbuf's must be of type MT_RIGHTS). * 3. If a name or rights record exists, then it must be followed by a data * record, perhaps of zero length. * * Before using a new socket structure it is first necessary to reserve * buffer space to the socket, by calling sbreserve(). This should commit * some of the available buffer space in the system buffer pool for the * socket (currently, it does nothing but enforce limits). The space should * be released by calling sbrelease() when the socket is destroyed. 
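As a concrete illustration of the reservation convention described above, here is a minimal sketch of how a protocol attach routine might commit buffer space with soreserve(); the routine name and the byte counts are purely illustrative, and real protocols take their defaults from tunables. The space is later given back through sbrelease()/sbdestroy() when the socket goes away.

/*
 * Hypothetical pru_attach-style routine: reserve send and receive
 * buffer space up front and fail the attach if the limits cannot be
 * satisfied (soreserve() returns ENOBUFS in that case).
 */
static int
example_attach(struct socket *so, int proto, struct thread *td)
{
        int error;

        error = soreserve(so, 32 * 1024 /* send */, 64 * 1024 /* receive */);
        if (error != 0)
                return (error);
        return (0);
}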
*/ int soreserve(struct socket *so, u_long sndcc, u_long rcvcc) { struct thread *td = curthread; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0) goto bad; if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0) goto bad2; if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (0); bad2: sbrelease_locked(&so->so_snd, so); bad: SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (ENOBUFS); } static int sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS) { int error = 0; u_long tmp_sb_max = sb_max; error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req); if (error || !req->newptr) return (error); if (tmp_sb_max < MSIZE + MCLBYTES) return (EINVAL); sb_max = tmp_sb_max; sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES); return (0); } /* * Allot mbufs to a sockbuf. Attempt to scale mbmax so that mbcnt doesn't * become limiting if buffering efficiency is near the normal case. */ int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td) { rlim_t sbsize_limit; SOCKBUF_LOCK_ASSERT(sb); /* * When a thread is passed, we take into account the thread's socket * buffer size limit. The caller will generally pass curthread, but * in the TCP input path, NULL will be passed to indicate that no * appropriate thread resource limits are available. In that case, * we don't apply a process limit. */ if (cc > sb_max_adj) return (0); if (td != NULL) { sbsize_limit = lim_cur(td, RLIMIT_SBSIZE); } else sbsize_limit = RLIM_INFINITY; if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, sbsize_limit)) return (0); sb->sb_mbmax = min(cc * sb_efficiency, sb_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (1); } int sbsetopt(struct socket *so, int cmd, u_long cc) { struct sockbuf *sb; short *flags; u_int *hiwat, *lowat; int error; sb = NULL; SOCK_LOCK(so); if (SOLISTENING(so)) { switch (cmd) { case SO_SNDLOWAT: case SO_SNDBUF: lowat = &so->sol_sbsnd_lowat; hiwat = &so->sol_sbsnd_hiwat; flags = &so->sol_sbsnd_flags; break; case SO_RCVLOWAT: case SO_RCVBUF: lowat = &so->sol_sbrcv_lowat; hiwat = &so->sol_sbrcv_hiwat; flags = &so->sol_sbrcv_flags; break; } } else { switch (cmd) { case SO_SNDLOWAT: case SO_SNDBUF: sb = &so->so_snd; break; case SO_RCVLOWAT: case SO_RCVBUF: sb = &so->so_rcv; break; } flags = &sb->sb_flags; hiwat = &sb->sb_hiwat; lowat = &sb->sb_lowat; SOCKBUF_LOCK(sb); } error = 0; switch (cmd) { case SO_SNDBUF: case SO_RCVBUF: if (SOLISTENING(so)) { if (cc > sb_max_adj) { error = ENOBUFS; break; } *hiwat = cc; if (*lowat > *hiwat) *lowat = *hiwat; } else { if (!sbreserve_locked(sb, cc, so, curthread)) error = ENOBUFS; } if (error == 0) *flags &= ~SB_AUTOSIZE; break; case SO_SNDLOWAT: case SO_RCVLOWAT: /* * Make sure the low-water is never greater than the * high-water. */ *lowat = (cc > *hiwat) ? *hiwat : cc; break; } if (!SOLISTENING(so)) SOCKBUF_UNLOCK(sb); SOCK_UNLOCK(so); return (error); } /* * Free mbufs held by a socket, and reserved mbuf space. 
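The sbsetopt()/sbreserve_locked() path above is what ultimately services SO_SNDBUF and SO_RCVBUF, bounded by sb_max_adj (kern.ipc.maxsockbuf). A small userland sketch of exercising it follows; the 256 kB figure is only an example.

#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        int s, rcvbuf = 256 * 1024;     /* illustrative size */

        s = socket(AF_INET, SOCK_STREAM, 0);
        if (s == -1)
                return (1);
        /* Serviced by sbsetopt(); fails with ENOBUFS above the limit. */
        if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) == -1)
                fprintf(stderr, "SO_RCVBUF: %s\n", strerror(errno));
        close(s);
        return (0);
}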
*/ void sbrelease_internal(struct sockbuf *sb, struct socket *so) { sbflush_internal(sb); (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); sb->sb_mbmax = 0; } void sbrelease_locked(struct sockbuf *sb, struct socket *so) { SOCKBUF_LOCK_ASSERT(sb); sbrelease_internal(sb, so); } void sbrelease(struct sockbuf *sb, struct socket *so) { SOCKBUF_LOCK(sb); sbrelease_locked(sb, so); SOCKBUF_UNLOCK(sb); } void sbdestroy(struct sockbuf *sb, struct socket *so) { sbrelease_internal(sb, so); } /* * Routines to add and remove data from an mbuf queue. * * The routines sbappend() or sbappendrecord() are normally called to append * new mbufs to a socket buffer, after checking that adequate space is * available, comparing the function sbspace() with the amount of data to be * added. sbappendrecord() differs from sbappend() in that data supplied is * treated as the beginning of a new record. To place a sender's address, * optional access rights, and data in a socket receive buffer, * sbappendaddr() should be used. To place access rights and data in a * socket receive buffer, sbappendrights() should be used. In either case, * the new data begins a new record. Note that unlike sbappend() and * sbappendrecord(), these routines check for the caller that there will be * enough space to store the data. Each fails if there is not enough space, * or if it cannot find mbufs to store additional information in. * * Reliable protocols may use the socket send buffer to hold data awaiting * acknowledgement. Data is normally copied from a socket send buffer in a * protocol with m_copy for output to a peer, and then removing the data from * the socket buffer with sbdrop() or sbdroprecord() when the data is * acknowledged by the peer. */ #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; if (m != sb->sb_lastrecord) { printf("%s: sb_mb %p sb_lastrecord %p last %p\n", __func__, sb->sb_mb, sb->sb_lastrecord, m); printf("packet chain:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) printf("\t%p\n", m); panic("%s from %s:%u", __func__, file, line); } } void sblastmbufchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mbtail) { printf("%s: sb_mb %p sb_mbtail %p last %p\n", __func__, sb->sb_mb, sb->sb_mbtail, m); printf("packet tree:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { printf("\t"); for (n = m; n != NULL; n = n->m_next) printf("%p ", n); printf("\n"); } panic("%s from %s:%u", __func__, file, line); } } #endif /* SOCKBUF_DEBUG */ #define SBLINKRECORD(sb, m0) do { \ SOCKBUF_LOCK_ASSERT(sb); \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ (sb)->sb_mb = (m0); \ (sb)->sb_lastrecord = (m0); \ } while (/*CONSTCOND*/0) /* * Append mbuf chain m to the last record in the socket buffer sb. The * additional space associated the mbuf chain is recorded in sb. Empty mbufs * are discarded and mbufs are compacted where possible. */ void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags) { struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); if (m == NULL) return; sbm_clrprotoflags(m, flags); SBLASTRECORDCHK(sb); n = sb->sb_mb; if (n) { while (n->m_nextpkt) n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! 
*/ return; } } while (n->m_next && (n = n->m_next)); } else { /* * XXX Would like to simply use sb_mbtail here, but * XXX I need to verify that I won't miss an EOR that * XXX way. */ if ((n = sb->sb_lastrecord) != NULL) { do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * If this is the first record in the socket buffer, * it's also the last record. */ sb->sb_lastrecord = m; } } sbcompress(sb, m, n); SBLASTRECORDCHK(sb); } /* * Append mbuf chain m to the last record in the socket buffer sb. The * additional space associated the mbuf chain is recorded in sb. Empty mbufs * are discarded and mbufs are compacted where possible. */ void sbappend(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK(sb); sbappend_locked(sb, m, flags); SOCKBUF_UNLOCK(sb); } /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, * that is, a stream protocol (such as TCP). */ void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); SBLASTMBUFCHK(sb); /* Remove all packet headers and mbuf tags to get a pure data chain. */ m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0); sbcompress(sb, m, sb->sb_mbtail); sb->sb_lastrecord = sb->sb_mb; SBLASTRECORDCHK(sb); } /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, * that is, a stream protocol (such as TCP). */ void sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK(sb); sbappendstream_locked(sb, m, flags); SOCKBUF_UNLOCK(sb); } #ifdef SOCKBUF_DEBUG void sbcheck(struct sockbuf *sb, const char *file, int line) { struct mbuf *m, *n, *fnrdy; u_long acc, ccc, mbcnt; SOCKBUF_LOCK_ASSERT(sb); acc = ccc = mbcnt = 0; fnrdy = NULL; for (m = sb->sb_mb; m; m = n) { n = m->m_nextpkt; for (; m; m = m->m_next) { if (m->m_len == 0) { printf("sb %p empty mbuf %p\n", sb, m); goto fail; } if ((m->m_flags & M_NOTREADY) && fnrdy == NULL) { if (m != sb->sb_fnrdy) { printf("sb %p: fnrdy %p != m %p\n", sb, sb->sb_fnrdy, m); goto fail; } fnrdy = m; } if (fnrdy) { if (!(m->m_flags & M_NOTAVAIL)) { printf("sb %p: fnrdy %p, m %p is avail\n", sb, sb->sb_fnrdy, m); goto fail; } } else acc += m->m_len; ccc += m->m_len; mbcnt += MSIZE; if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; } } if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) { printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n", acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt); goto fail; } return; fail: panic("%s from %s:%u", __func__, file, line); } #endif /* * As above, except the mbuf chain begins a new record. */ void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0) { struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); if (m0 == NULL) return; m_clrprotoflags(m0); /* * Put the first mbuf on the queue. Note this permits zero length * records. */ sballoc(sb, m0); SBLASTRECORDCHK(sb); SBLINKRECORD(sb, m0); sb->sb_mbtail = m0; m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } /* always call sbcompress() so it can do SBLASTMBUFCHK() */ sbcompress(sb, m, m0); } /* * As above, except the mbuf chain begins a new record. 
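To make the stream-append contract concrete, here is a rough sketch of the pattern a stream protocol's input path follows when handing an in-order data chain to the receive buffer; this mirrors what the TCP input path does, the function name is illustrative, and connection bookkeeping is omitted.

/*
 * Deliver an in-order mbuf chain to a stream socket's receive buffer
 * and wake up readers.  sorwakeup_locked() drops the sockbuf lock.
 */
static void
example_stream_deliver(struct socket *so, struct mbuf *m)
{
        SOCKBUF_LOCK(&so->so_rcv);
        if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
                SOCKBUF_UNLOCK(&so->so_rcv);
                m_freem(m);
                return;
        }
        sbappendstream_locked(&so->so_rcv, m, 0);
        sorwakeup_locked(so);
}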
*/ void sbappendrecord(struct sockbuf *sb, struct mbuf *m0) { SOCKBUF_LOCK(sb); sbappendrecord_locked(sb, m0); SOCKBUF_UNLOCK(sb); } /* Helper routine that appends data, control, and address to a sockbuf. */ static int sbappendaddr_locked_internal(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control, struct mbuf *ctrl_last) { struct mbuf *m, *n, *nlast; #if MSIZE <= 256 if (asa->sa_len > MLEN) return (0); #endif m = m_get(M_NOWAIT, MT_SONAME); if (m == NULL) return (0); m->m_len = asa->sa_len; bcopy(asa, mtod(m, caddr_t), asa->sa_len); if (m0) { m_clrprotoflags(m0); m_tag_delete_chain(m0, NULL); /* * Clear some persistent info from pkthdr. * We don't use m_demote(), because some netgraph consumers * expect M_PKTHDR presence. */ m0->m_pkthdr.rcvif = NULL; m0->m_pkthdr.flowid = 0; m0->m_pkthdr.csum_flags = 0; m0->m_pkthdr.fibnum = 0; m0->m_pkthdr.rsstype = 0; } if (ctrl_last) ctrl_last->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; int space = asa->sa_len; SOCKBUF_LOCK_ASSERT(sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr_locked"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &ctrl_last); if (space > sbspace(sb)) return (0); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if insufficient mbufs. Does not validate space * on the receiving sockbuf. */ int sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; SOCKBUF_LOCK_ASSERT(sb); ctrl_last = (control == NULL) ? NULL : m_last(control); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. 
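For datagram sockets the usual consumer looks more like the sketch below: sbappendaddr_locked() checks for space itself, and a zero return means the caller still owns (and must free) the data and control chains. The function name is illustrative; this mirrors the UDP-style delivery pattern.

/*
 * Append sender address, optional control data and payload as one
 * record on the receive queue, then wake up readers.
 */
static void
example_dgram_deliver(struct socket *so, struct sockaddr *from,
    struct mbuf *data, struct mbuf *control)
{
        SOCKBUF_LOCK(&so->so_rcv);
        if (sbappendaddr_locked(&so->so_rcv, from, data, control) == 0) {
                /* No space or no mbufs: free what we still own. */
                SOCKBUF_UNLOCK(&so->so_rcv);
                m_freem(data);
                m_freem(control);
                return;
        }
        sorwakeup_locked(so);   /* releases the sockbuf lock */
}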
*/ int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { int retval; SOCKBUF_LOCK(sb); retval = sbappendaddr_locked(sb, asa, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } int sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *n, *mlast; int space; SOCKBUF_LOCK_ASSERT(sb); if (control == NULL) panic("sbappendcontrol_locked"); space = m_length(control, &n) + m_length(m0, NULL); if (space > sbspace(sb)) return (0); m_clrprotoflags(m0); n->m_next = m0; /* concatenate data to control */ SBLASTRECORDCHK(sb); for (m = control; m->m_next; m = m->m_next) sballoc(sb, m); sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) { int retval; SOCKBUF_LOCK(sb); retval = sbappendcontrol_locked(sb, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } /* * Append the data in mbuf chain (m) into the socket buffer sb following mbuf * (n). If (n) is NULL, the buffer is presumed empty. * * When the data is compressed, mbufs in the chain may be handled in one of * three ways: * * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no * record boundary, and no change in data type). * * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into * an mbuf already in the socket buffer. This can occur if an * appropriate mbuf exists, there is room, both mbufs are not marked as * not ready, and no merging of data types will occur. * * (3) The mbuf may be appended to the end of the existing mbuf chain. * * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as * end-of-record. */ void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) { int eor = 0; struct mbuf *o; SOCKBUF_LOCK_ASSERT(sb); while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) { if (sb->sb_lastrecord == m) sb->sb_lastrecord = m->m_next; m = m_free(m); continue; } if (n && (n->m_flags & M_EOR) == 0 && M_WRITABLE(n) && ((sb->sb_flags & SB_NOCOALESCE) == 0) && !(m->m_flags & M_NOTREADY) && !(n->m_flags & M_NOTREADY) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, (unsigned)m->m_len); n->m_len += m->m_len; sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) sb->sb_acc += m->m_len; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) /* XXX: Probably don't need.*/ sb->sb_ctl += m->m_len; m = m_free(m); continue; } if (n) n->m_next = m; else sb->sb_mb = m; sb->sb_mbtail = m; sballoc(sb, m); n = m; m->m_flags &= ~M_EOR; m = m->m_next; n->m_next = 0; } if (eor) { KASSERT(n != NULL, ("sbcompress: eor && n == NULL")); n->m_flags |= eor; } SBLASTMBUFCHK(sb); } /* * Free all mbufs in a sockbuf. Check that all resources are reclaimed. */ static void sbflush_internal(struct sockbuf *sb) { while (sb->sb_mbcnt) { /* * Don't call sbcut(sb, 0) if the leading mbuf is non-empty: * we would loop forever. Panic instead. 
*/ if (sb->sb_ccc == 0 && (sb->sb_mb == NULL || sb->sb_mb->m_len)) break; m_freem(sbcut_internal(sb, (int)sb->sb_ccc)); } KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0, ("%s: ccc %u mb %p mbcnt %u", __func__, sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt)); } void sbflush_locked(struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sbflush_internal(sb); } void sbflush(struct sockbuf *sb) { SOCKBUF_LOCK(sb); sbflush_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Cut data from (the front of) a sockbuf. */ static struct mbuf * sbcut_internal(struct sockbuf *sb, int len) { struct mbuf *m, *next, *mfree; KASSERT(len >= 0, ("%s: len is %d but it is supposed to be >= 0", __func__, len)); KASSERT(len <= sb->sb_ccc, ("%s: len: %d is > ccc: %u", __func__, len, sb->sb_ccc)); next = (m = sb->sb_mb) ? m->m_nextpkt : 0; mfree = NULL; while (len > 0) { if (m == NULL) { KASSERT(next, ("%s: no next, len %d", __func__, len)); m = next; next = m->m_nextpkt; } if (m->m_len > len) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p M_NOTAVAIL", __func__, m)); m->m_len -= len; m->m_data += len; sb->sb_ccc -= len; sb->sb_acc -= len; if (sb->sb_sndptroff != 0) sb->sb_sndptroff -= len; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl -= len; break; } len -= m->m_len; sbfree(sb, m); /* * Do not put M_NOTREADY buffers to the free list, they * are referenced from outside. */ if (m->m_flags & M_NOTREADY) m = m->m_next; else { struct mbuf *n; n = m->m_next; m->m_next = mfree; mfree = m; m = n; } } /* * Free any zero-length mbufs from the buffer. * For SOCK_DGRAM sockets such mbufs represent empty records. * XXX: For SOCK_STREAM sockets such mbufs can appear in the buffer, * when sosend_generic() needs to send only control data. */ while (m && m->m_len == 0) { struct mbuf *n; sbfree(sb, m); n = m->m_next; m->m_next = mfree; mfree = m; m = n; } if (m) { sb->sb_mb = m; m->m_nextpkt = next; } else sb->sb_mb = next; /* * First part is an inline SB_EMPTY_FIXUP(). Second part makes sure * sb_lastrecord is up-to-date if we dropped part of the last record. */ m = sb->sb_mb; if (m == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (m->m_nextpkt == NULL) { sb->sb_lastrecord = m; } return (mfree); } /* * Drop data from (the front of) a sockbuf. */ void sbdrop_locked(struct sockbuf *sb, int len) { SOCKBUF_LOCK_ASSERT(sb); m_freem(sbcut_internal(sb, len)); } /* * Drop data from (the front of) a sockbuf, * and return it to caller. */ struct mbuf * sbcut_locked(struct sockbuf *sb, int len) { SOCKBUF_LOCK_ASSERT(sb); return (sbcut_internal(sb, len)); } void sbdrop(struct sockbuf *sb, int len) { struct mbuf *mfree; SOCKBUF_LOCK(sb); mfree = sbcut_internal(sb, len); SOCKBUF_UNLOCK(sb); m_freem(mfree); } /* * Maintain a pointer and offset pair into the socket buffer mbuf chain to * avoid traversal of the entire socket buffer for larger offsets. */ struct mbuf * sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff) { struct mbuf *m, *ret; KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); KASSERT(off + len <= sb->sb_acc, ("%s: beyond sb", __func__)); KASSERT(sb->sb_sndptroff <= sb->sb_acc, ("%s: sndptroff broken", __func__)); /* * Is off below stored offset? Happens on retransmits. * Just return, we can't help here. */ if (sb->sb_sndptroff > off) { *moff = off; return (sb->sb_mb); } /* Return closest mbuf in chain for current offset. */ *moff = off - sb->sb_sndptroff; m = ret = sb->sb_sndptr ? 
sb->sb_sndptr : sb->sb_mb; if (*moff == m->m_len) { *moff = 0; sb->sb_sndptroff += m->m_len; m = ret = m->m_next; KASSERT(ret->m_len > 0, ("mbuf %p in sockbuf %p chain has no valid data", ret, sb)); } /* Advance by len to be as close as possible for the next transmit. */ for (off = off - sb->sb_sndptroff + len - 1; off > 0 && m != NULL && off >= m->m_len; m = m->m_next) { sb->sb_sndptroff += m->m_len; off -= m->m_len; } if (off > 0 && m == NULL) panic("%s: sockbuf %p and mbuf %p clashing", __func__, sb, ret); sb->sb_sndptr = m; return (ret); } +struct mbuf * +sbsndptr_noadv(struct sockbuf *sb, uint32_t off, uint32_t *moff) +{ + struct mbuf *m; + + KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); + if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) { + *moff = off; + if (sb->sb_sndptr == NULL) { + sb->sb_sndptr = sb->sb_mb; + sb->sb_sndptroff = 0; + } + return (sb->sb_mb); + } else { + m = sb->sb_sndptr; + off -= sb->sb_sndptroff; + } + *moff = off; + return (m); +} + +void +sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, uint32_t len) +{ + /* + * A small copy was done, advance forward the sb_sbsndptr to cover + * it. + */ + struct mbuf *m; + + if (mb != sb->sb_sndptr) { + /* Did not copyout at the same mbuf */ + return; + } + m = mb; + while (m && (len > 0)) { + if (len >= m->m_len) { + len -= m->m_len; + if (m->m_next) { + sb->sb_sndptroff += m->m_len; + sb->sb_sndptr = m->m_next; + } + m = m->m_next; + } else { + len = 0; + } + } +} + /* * Return the first mbuf and the mbuf data offset for the provided * send offset without changing the "sb_sndptroff" field. */ struct mbuf * sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff) { struct mbuf *m; KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); /* * If the "off" is below the stored offset, which happens on * retransmits, just use "sb_mb": */ if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) { m = sb->sb_mb; } else { m = sb->sb_sndptr; off -= sb->sb_sndptroff; } while (off > 0 && m != NULL) { if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } *moff = off; return (m); } /* * Drop a record off the front of a sockbuf and move the next record to the * front. */ void sbdroprecord_locked(struct sockbuf *sb) { struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; do { sbfree(sb, m); m = m_free(m); } while (m); } SB_EMPTY_FIXUP(sb); } /* * Drop a record off the front of a sockbuf and move the next record to the * front. */ void sbdroprecord(struct sockbuf *sb) { SOCKBUF_LOCK(sb); sbdroprecord_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Create a "control" mbuf containing the specified data with the specified * type for presentation on a socket buffer. */ struct mbuf * sbcreatecontrol(caddr_t p, int size, int type, int level) { struct cmsghdr *cp; struct mbuf *m; if (CMSG_SPACE((u_int)size) > MCLBYTES) return ((struct mbuf *) NULL); if (CMSG_SPACE((u_int)size) > MLEN) m = m_getcl(M_NOWAIT, MT_CONTROL, 0); else m = m_get(M_NOWAIT, MT_CONTROL); if (m == NULL) return ((struct mbuf *) NULL); cp = mtod(m, struct cmsghdr *); m->m_len = 0; KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), ("sbcreatecontrol: short mbuf")); /* * Don't leave the padding between the msg header and the * cmsg data and the padding after the cmsg data un-initialized. 
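The two routines just added, sbsndptr_noadv() and sbsndptr_adv(), split the cached send-pointer lookup from the advance, so a transmitter can locate an offset first and only move the cache once data has actually been copied out. A rough sketch of that usage, with an illustrative function name and without any real transmit logic:

/*
 * Locate the mbuf holding byte "off" of the send buffer, copy "len"
 * bytes from there, then account for the copy.  Serialization against
 * the socket buffer is assumed to be provided by the caller, as in the
 * TCP output path.
 */
static void
example_transmit(struct sockbuf *sb, uint32_t off, uint32_t len)
{
        struct mbuf *m;
        uint32_t moff;

        m = sbsndptr_noadv(sb, off, &moff);     /* does not move sb_sndptr */
        /* ... copy "len" bytes starting "moff" bytes into "m" ... */
        sbsndptr_adv(sb, m, len);               /* advance the cached pointer */
}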
*/ bzero(cp, CMSG_SPACE((u_int)size)); if (p != NULL) (void)memcpy(CMSG_DATA(cp), p, size); m->m_len = CMSG_SPACE(size); cp->cmsg_len = CMSG_LEN(size); cp->cmsg_level = level; cp->cmsg_type = type; return (m); } /* * This does the same for socket buffers that sotoxsocket does for sockets: * generate an user-format data structure describing the socket buffer. Note * that the xsockbuf structure, since it is always embedded in a socket, does * not include a self pointer nor a length. We make this entry point public * in case some other mechanism needs it. */ void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) { xsb->sb_cc = sb->sb_ccc; xsb->sb_hiwat = sb->sb_hiwat; xsb->sb_mbcnt = sb->sb_mbcnt; xsb->sb_mcnt = sb->sb_mcnt; xsb->sb_ccnt = sb->sb_ccnt; xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; xsb->sb_timeo = sb->sb_timeo; } /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ static int dummy; SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW, &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size"); SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, &sb_efficiency, 0, "Socket buffer size waste factor"); Index: head/sys/modules/tcp/Makefile =================================================================== --- head/sys/modules/tcp/Makefile (revision 334803) +++ head/sys/modules/tcp/Makefile (revision 334804) @@ -1,23 +1,25 @@ # # $FreeBSD$ # SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" SUBDIR= \ ${_tcp_fastpath} \ + ${_tcp_rack} \ ${_tcpmd5} \ .if ${MK_EXTRA_TCP_STACKS} != "no" || defined(ALL_MODULES) _tcp_fastpath= fastpath +_tcp_rack= rack .endif .if (${MK_INET_SUPPORT} != "no" || ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) .if ${MK_IPSEC_SUPPORT} != "no" _tcpmd5= tcpmd5 .endif .endif .include Index: head/sys/modules/tcp/rack/Makefile =================================================================== --- head/sys/modules/tcp/rack/Makefile (nonexistent) +++ head/sys/modules/tcp/rack/Makefile (revision 334804) @@ -0,0 +1,24 @@ +# +# $FreeBSD$ +# + +.PATH: ${.CURDIR}/../../../netinet/tcp_stacks + +STACKNAME= rack +KMOD= tcp_${STACKNAME} +SRCS= rack.c sack_filter.c + +SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h +SRCS+= opt_tcpdebug.h +SRCS+= opt_kern_tls.h + +# +# Enable full debugging +# +#CFLAGS += -g + +CFLAGS+= -DMODNAME=${KMOD} +CFLAGS+= -DSTACKNAME=${STACKNAME} +CFLAGS+= -DSTACKALIAS=rack_18q21 + +.include Property changes on: head/sys/modules/tcp/rack/Makefile ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/netinet/tcp.h =================================================================== --- head/sys/netinet/tcp.h (revision 334803) +++ head/sys/netinet/tcp.h (revision 334804) @@ -1,284 +1,340 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_TCP_H_ #define _NETINET_TCP_H_ #include #include #if __BSD_VISIBLE typedef u_int32_t tcp_seq; #define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ #define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ /* * TCP header. * Per RFC 793, September, 1981. */ struct tcphdr { u_short th_sport; /* source port */ u_short th_dport; /* destination port */ tcp_seq th_seq; /* sequence number */ tcp_seq th_ack; /* acknowledgement number */ #if BYTE_ORDER == LITTLE_ENDIAN u_char th_x2:4, /* (unused) */ th_off:4; /* data offset */ #endif #if BYTE_ORDER == BIG_ENDIAN u_char th_off:4, /* data offset */ th_x2:4; /* (unused) */ #endif u_char th_flags; #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 #define TH_PUSH 0x08 #define TH_ACK 0x10 #define TH_URG 0x20 #define TH_ECE 0x40 #define TH_CWR 0x80 #define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) #define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR" u_short th_win; /* window */ u_short th_sum; /* checksum */ u_short th_urp; /* urgent pointer */ }; #define TCPOPT_EOL 0 #define TCPOLEN_EOL 1 #define TCPOPT_PAD 0 /* padding after EOL */ #define TCPOLEN_PAD 1 #define TCPOPT_NOP 1 #define TCPOLEN_NOP 1 #define TCPOPT_MAXSEG 2 #define TCPOLEN_MAXSEG 4 #define TCPOPT_WINDOW 3 #define TCPOLEN_WINDOW 3 #define TCPOPT_SACK_PERMITTED 4 #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 #define TCPOLEN_SACKHDR 2 #define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 #define TCPOPT_FAST_OPEN 34 #define TCPOLEN_FAST_OPEN_EMPTY 2 /* Miscellaneous constants */ #define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ #define TCP_MAX_SACK 4 /* MAX # SACKs sent in any segment */ /* * The default maximum segment size (MSS) to be used for new TCP connections * when path MTU discovery is not enabled. 
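As a small worked example of the header layout above, the sketch below shows the usual way the TH_* masks and the 4-bit th_off field are consumed when classifying a received segment; it is plain userland C over an already-captured header, and the helper names are illustrative.

#include <sys/types.h>
#include <netinet/tcp.h>
#include <stdbool.h>
#include <stddef.h>

/* True for the second step of the three-way handshake. */
static bool
is_synack(const struct tcphdr *th)
{
        return ((th->th_flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) &&
            (th->th_flags & TH_RST) == 0);
}

/* th_off counts 32-bit words, so the header length in bytes is: */
static size_t
tcp_hdrlen(const struct tcphdr *th)
{
        return ((size_t)th->th_off << 2);       /* 20..60 bytes */
}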
* * RFC879 derives the default MSS from the largest datagram size hosts are * minimally required to handle directly or through IP reassembly minus the * size of the IP and TCP header. With IPv6 the minimum MTU is specified * in RFC2460. * * For IPv4 the MSS is 576 - sizeof(struct tcpiphdr) * For IPv6 the MSS is IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct tcphdr) * * We use explicit numerical definition here to avoid header pollution. */ #define TCP_MSS 536 #define TCP6_MSS 1220 /* * Limit the lowest MSS we accept for path MTU discovery and the TCP SYN MSS * option. Allowing low values of MSS can consume significant resources and * be used to mount a resource exhaustion attack. * Connections requesting lower MSS values will be rounded up to this value * and the IP_DF flag will be cleared to allow fragmentation along the path. * * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. Setting * it to "0" disables the minmss check. * * The default value is fine for TCP across the Internet's smallest official * link MTU (256 bytes for AX.25 packet radio). However, a connection is very * unlikely to come across such low MTU interfaces these days (anno domini 2003). */ #define TCP_MINMSS 216 #define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ #define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ #define TCP_MAX_WINSHIFT 14 /* maximum window shift */ #define TCP_MAXBURST 4 /* maximum segments in a burst */ #define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ #define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) /* max space left for options */ #define TCP_FASTOPEN_MIN_COOKIE_LEN 4 /* Per RFC7413 */ #define TCP_FASTOPEN_MAX_COOKIE_LEN 16 /* Per RFC7413 */ #define TCP_FASTOPEN_PSK_LEN 16 /* Same as TCP_FASTOPEN_KEY_LEN */ #endif /* __BSD_VISIBLE */ /* * User-settable options (used with setsockopt). These are discrete * values and are not masked together. Some values appear to be * bitmasks for historical reasons. 
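Because these are discrete values handled one option at a time, a consumer simply sets them individually with setsockopt(). A self-contained userland example for TCP_NODELAY:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

int
main(void)
{
        int s, on = 1;

        s = socket(AF_INET, SOCK_STREAM, 0);
        if (s == -1)
                return (1);
        /* Disable Nagle so small writes are sent immediately. */
        if (setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)) == -1) {
                close(s);
                return (1);
        }
        close(s);
        return (0);
}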
*/ #define TCP_NODELAY 1 /* don't delay send to coalesce packets */ #if __BSD_VISIBLE #define TCP_MAXSEG 2 /* set maximum segment size */ #define TCP_NOPUSH 4 /* don't push last block of write */ #define TCP_NOOPT 8 /* don't use TCP options */ #define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ #define TCP_INFO 32 /* retrieve tcp_info structure */ #define TCP_LOG 34 /* configure event logging for connection */ #define TCP_LOGBUF 35 /* retrieve event log for connection */ #define TCP_LOGID 36 /* configure log ID to correlate connections */ #define TCP_LOGDUMP 37 /* dump connection log events to device */ #define TCP_LOGDUMPID 38 /* dump events from connections with same ID to device */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ #define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ +#define TCP_DELACK 72 /* socket option for delayed ack */ #define TCP_KEEPINIT 128 /* N, time to establish connection */ #define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ #define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ #define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ #define TCP_FASTOPEN 1025 /* enable TFO / was created via TFO */ #define TCP_PCAP_OUT 2048 /* number of output packets to keep */ #define TCP_PCAP_IN 4096 /* number of input packets to keep */ #define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */ +/* Options for Rack and BBR */ +#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */ +#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */ +#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacing reduction factor (divisor) */ +#define TCP_RACK_PACE_MAX_SEG 1054 /* Max segments in a pace */ +#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */ +#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */ +#define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */ +#define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */ +#define TCP_RACK_EARLY_RECOV 1059 /* Should recovery happen early (bool) */ +#define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */ +#define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */ +#define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */ +#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */ +#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. 
rack-rtt + reord + N */ +#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */ +#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */ +#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */ +#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */ +#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */ +#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */ +#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */ +#define TCP_BBR_PROBE_RTT_INT 1072 /* How long in useconds between probe-rtt */ +#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */ +#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */ +#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */ +#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */ +#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */ +#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */ +#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */ +#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */ +#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */ +#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */ +#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */ +#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */ +#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */ +#define TCP_BBR_PACE_PER_SEC 1086 +#define TCP_BBR_PACE_DEL_TAR 1087 +#define TCP_BBR_PACE_SEG_MAX 1088 +#define TCP_BBR_PACE_SEG_MIN 1089 +#define TCP_BBR_PACE_CROSS 1090 +#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ +#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ +#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */ +#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */ +#define TCP_RACK_TLP_USE 1095 +#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */ +#define TCP_BBR_EXTRA_GAIN 1097 +#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */ +#define TCP_BBR_RETRAN_WTSO 1099 +#define TCP_DATA_AFTER_CLOSE 1100 +#define TCP_BBR_PROBE_RTT_GAIN 1101 +#define TCP_BBR_PROBE_RTT_LEN 1102 + + /* Start of reserved space for third-party user-settable options. */ #define TCP_VENDOR SO_VENDOR #define TCP_CA_NAME_MAX 16 /* max congestion control name length */ #define TCPI_OPT_TIMESTAMPS 0x01 #define TCPI_OPT_SACK 0x02 #define TCPI_OPT_WSCALE 0x04 #define TCPI_OPT_ECN 0x08 #define TCPI_OPT_TOE 0x10 /* Maximum length of log ID. */ #define TCP_LOG_ID_LEN 64 /* * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits * the caller to query certain information about the state of a TCP * connection. We provide an overlapping set of fields with the Linux * implementation, but since this is a fixed size structure, room has been * left for growth. In order to maximize potential future compatibility with * the Linux API, the same variable names and order have been adopted, and * padding left to make room for omitted fields in case they are added later. * * XXX: This is currently an unstable ABI/API, in that it is expected to * change. */ struct tcp_info { u_int8_t tcpi_state; /* TCP FSM state. 
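The new TCP_RACK_*/TCP_BBR_* values above are per-connection options that only make sense once the matching stack has been attached with TCP_FUNCTION_BLK. A hedged userland sketch of that two-step sequence, assuming the tcp_rack module added by this commit is loaded; the option and value chosen are illustrative.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        struct tcp_function_set tfs;
        int s, thresh = 2;      /* illustrative shift amount */

        s = socket(AF_INET, SOCK_STREAM, 0);
        if (s == -1)
                return (1);

        /* Attach the RACK stack to this connection. */
        memset(&tfs, 0, sizeof(tfs));
        strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
        if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs)) == -1) {
                perror("TCP_FUNCTION_BLK");
                return (1);
        }

        /* Now RACK-specific knobs are understood by the stack. */
        if (setsockopt(s, IPPROTO_TCP, TCP_RACK_REORD_THRESH, &thresh,
            sizeof(thresh)) == -1)
                perror("TCP_RACK_REORD_THRESH");
        return (0);
}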
*/ u_int8_t __tcpi_ca_state; u_int8_t __tcpi_retransmits; u_int8_t __tcpi_probes; u_int8_t __tcpi_backoff; u_int8_t tcpi_options; /* Options enabled on conn. */ u_int8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */ tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */ u_int32_t tcpi_rto; /* Retransmission timeout (usec). */ u_int32_t __tcpi_ato; u_int32_t tcpi_snd_mss; /* Max segment size for send. */ u_int32_t tcpi_rcv_mss; /* Max segment size for receive. */ u_int32_t __tcpi_unacked; u_int32_t __tcpi_sacked; u_int32_t __tcpi_lost; u_int32_t __tcpi_retrans; u_int32_t __tcpi_fackets; /* Times; measurements in usecs. */ u_int32_t __tcpi_last_data_sent; u_int32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */ u_int32_t tcpi_last_data_recv; /* Time since last recv data. */ u_int32_t __tcpi_last_ack_recv; /* Metrics; variable units. */ u_int32_t __tcpi_pmtu; u_int32_t __tcpi_rcv_ssthresh; u_int32_t tcpi_rtt; /* Smoothed RTT in usecs. */ u_int32_t tcpi_rttvar; /* RTT variance in usecs. */ u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ u_int32_t __tcpi_advmss; u_int32_t __tcpi_reordering; u_int32_t __tcpi_rcv_rtt; u_int32_t tcpi_rcv_space; /* Advertised recv window. */ /* FreeBSD extensions to tcp_info. */ u_int32_t tcpi_snd_wnd; /* Advertised send window. */ u_int32_t tcpi_snd_bwnd; /* No longer used. */ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ u_int32_t tcpi_snd_rexmitpack; /* Retransmitted packets */ u_int32_t tcpi_rcv_ooopack; /* Out-of-order packets */ u_int32_t tcpi_snd_zerowin; /* Zero-sized windows sent */ /* Padding to grow without breaking ABI. */ u_int32_t __tcpi_pad[26]; /* Padding. */ }; /* * If this structure is provided when setting the TCP_FASTOPEN socket * option, and the enable member is non-zero, a subsequent connect will use * pre-shared key (PSK) mode using the provided key. */ struct tcp_fastopen { int enable; uint8_t psk[TCP_FASTOPEN_PSK_LEN]; }; #endif #define TCP_FUNCTION_NAME_LEN_MAX 32 struct tcp_function_set { char function_set_name[TCP_FUNCTION_NAME_LEN_MAX]; uint32_t pcbcnt; }; #endif /* !_NETINET_TCP_H_ */ Index: head/sys/netinet/tcp_log_buf.h =================================================================== --- head/sys/netinet/tcp_log_buf.h (revision 334803) +++ head/sys/netinet/tcp_log_buf.h (revision 334804) @@ -1,368 +1,368 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016-2018 * Netflix Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __tcp_log_buf_h__ #define __tcp_log_buf_h__ #define TCP_LOG_REASON_LEN 32 #define TCP_LOG_BUF_VER (6) /* * Because the (struct tcp_log_buffer) includes 8-byte uint64_t's, it requires * 8-byte alignment to work properly on all platforms. Therefore, we will * enforce 8-byte alignment for all the structures that may appear by * themselves (instead of being embedded in another structure) in a data * stream. */ #define ALIGN_TCP_LOG __aligned(8) /* Information about the socketbuffer state. */ struct tcp_log_sockbuf { uint32_t tls_sb_acc; /* available chars (sb->sb_acc) */ uint32_t tls_sb_ccc; /* claimed chars (sb->sb_ccc) */ uint32_t tls_sb_spare; /* spare */ }; /* Optional, verbose information that may be appended to an event log. */ struct tcp_log_verbose { #define TCP_FUNC_LEN 32 char tlv_snd_frm[TCP_FUNC_LEN]; /* tcp_output() caller */ char tlv_trace_func[TCP_FUNC_LEN]; /* Function that generated trace */ uint32_t tlv_trace_line; /* Line number that generated trace */ uint8_t _pad[4]; } ALIGN_TCP_LOG; /* Internal RACK state variables. */ struct tcp_log_rack { uint32_t tlr_rack_rtt; /* rc_rack_rtt */ uint8_t tlr_state; /* Internal RACK state */ uint8_t _pad[3]; /* Padding */ }; struct tcp_log_bbr { uint64_t cur_del_rate; uint64_t delRate; uint64_t rttProp; uint64_t bw_inuse; uint32_t inflight; uint32_t applimited; uint32_t delivered; uint32_t timeStamp; uint32_t epoch; uint32_t lt_epoch; uint32_t pkts_out; uint32_t flex1; uint32_t flex2; uint32_t flex3; uint32_t flex4; uint32_t flex5; uint32_t flex6; uint32_t lost; uint16_t pacing_gain; uint16_t cwnd_gain; uint16_t flex7; uint8_t bbr_state; uint8_t bbr_substate; - uint8_t inpacer; + uint8_t inhpts; uint8_t ininput; uint8_t use_lt_bw; uint8_t flex8; uint32_t pkt_epoch; }; /* Per-stack stack-specific info. 
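The renamed inhpts field above lives in the per-stack scratch area that a stack fills immediately before logging an event. A rough kernel-side sketch of populating it; the helper name and the choice of fields are illustrative, and real stacks pack many more values into the flex fields.

/*
 * Fill the stack-specific blob that rides along with a tcp_log_buffer
 * record (kernel context).
 */
static void
example_fill_stackinfo(union tcp_log_stackspecific *log, int on_hpts,
    uint32_t event_value)
{
        memset(log, 0, sizeof(*log));
        log->u_bbr.inhpts = on_hpts ? 1 : 0;    /* was "inpacer" before this change */
        log->u_bbr.flex1 = event_value;         /* meaning is event-specific */
}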
*/ union tcp_log_stackspecific { struct tcp_log_rack u_rack; struct tcp_log_bbr u_bbr; }; struct tcp_log_buffer { /* Event basics */ struct timeval tlb_tv; /* Timestamp of trace */ uint32_t tlb_ticks; /* Timestamp of trace */ uint32_t tlb_sn; /* Serial number */ uint8_t tlb_stackid; /* Stack ID */ uint8_t tlb_eventid; /* Event ID */ uint16_t tlb_eventflags; /* Flags for the record */ #define TLB_FLAG_RXBUF 0x0001 /* Includes receive buffer info */ #define TLB_FLAG_TXBUF 0x0002 /* Includes send buffer info */ #define TLB_FLAG_HDR 0x0004 /* Includes a TCP header */ #define TLB_FLAG_VERBOSE 0x0008 /* Includes function/line numbers */ #define TLB_FLAG_STACKINFO 0x0010 /* Includes stack-specific info */ int tlb_errno; /* Event error (if any) */ /* Internal session state */ struct tcp_log_sockbuf tlb_rxbuf; /* Receive buffer */ struct tcp_log_sockbuf tlb_txbuf; /* Send buffer */ int tlb_state; /* TCPCB t_state */ uint32_t tlb_starttime; /* TCPCB t_starttime */ uint32_t tlb_iss; /* TCPCB iss */ uint32_t tlb_flags; /* TCPCB flags */ uint32_t tlb_snd_una; /* TCPCB snd_una */ uint32_t tlb_snd_max; /* TCPCB snd_max */ uint32_t tlb_snd_cwnd; /* TCPCB snd_cwnd */ uint32_t tlb_snd_nxt; /* TCPCB snd_nxt */ uint32_t tlb_snd_recover;/* TCPCB snd_recover */ uint32_t tlb_snd_wnd; /* TCPCB snd_wnd */ uint32_t tlb_snd_ssthresh; /* TCPCB snd_ssthresh */ uint32_t tlb_srtt; /* TCPCB t_srtt */ uint32_t tlb_rttvar; /* TCPCB t_rttvar */ uint32_t tlb_rcv_up; /* TCPCB rcv_up */ uint32_t tlb_rcv_adv; /* TCPCB rcv_adv */ uint32_t tlb_rcv_nxt; /* TCPCB rcv_nxt */ tcp_seq tlb_sack_newdata; /* TCPCB sack_newdata */ uint32_t tlb_rcv_wnd; /* TCPCB rcv_wnd */ uint32_t tlb_dupacks; /* TCPCB t_dupacks */ int tlb_segqlen; /* TCPCB segqlen */ int tlb_snd_numholes; /* TCPCB snd_numholes */ uint32_t tlb_flex1; /* Event specific information */ uint32_t tlb_flex2; /* Event specific information */ uint8_t tlb_snd_scale:4, /* TCPCB snd_scale */ tlb_rcv_scale:4; /* TCPCB rcv_scale */ uint8_t _pad[3]; /* Padding */ /* Per-stack info */ union tcp_log_stackspecific tlb_stackinfo; #define tlb_rack tlb_stackinfo.u_rack /* The packet */ uint32_t tlb_len; /* The packet's data length */ struct tcphdr tlb_th; /* The TCP header */ uint8_t tlb_opts[TCP_MAXOLEN]; /* The TCP options */ /* Verbose information (optional) */ struct tcp_log_verbose tlb_verbose[0]; } ALIGN_TCP_LOG; enum tcp_log_events { TCP_LOG_IN = 1, /* Incoming packet 1 */ TCP_LOG_OUT, /* Transmit (without other event) 2 */ TCP_LOG_RTO, /* Retransmit timeout 3 */ TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */ TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ TCP_LOG_REORDER,/* Detected reorder 7 */ TCP_LOG_PACER, /* Pacer sending a packet 8 */ BBR_LOG_BBRUPD, /* We updated BBR info 9 */ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */ BBR_LOG_INQUEUE, /* The tcb had a packet input to it 12 */ BBR_LOG_TIMERSTAR, /* Start a timer 13 */ BBR_LOG_TIMERCANC, /* Cancel a timer 14 */ BBR_LOG_ENTREC, /* Entered recovery 15 */ BBR_LOG_EXITREC, /* Exited recovery 16 */ BBR_LOG_CWND, /* Cwnd change 17 */ BBR_LOG_BWSAMP, /* LT B/W sample has been made 18 */ BBR_LOG_MSGSIZE, /* We received a EMSGSIZE error 19 */ BBR_LOG_BBRRTT, /* BBR RTT is updated 20 */ BBR_LOG_JUSTRET, /* We just returned out of output 21 */ BBR_LOG_STATE, /* A BBR state change occured 22 */ BBR_LOG_PKT_EPOCH, /* A BBR packet epoch occured 23 */ BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ 
TCP_LOG_FLOWEND, /* End of a flow 25 */ BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */ BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ TCP_LOG_USERSEND, /* User level sends data 31 */ UNUSED_32, /* Unused 32 */ UNUSED_33, /* Unused 33 */ BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */ BBR_LOG_TO_PROCESS, /* A to was processed 35 */ BBR_LOG_BBRTSO, /* TSO update 36 */ BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */ BBR_LOG_PROGRESS, /* Progress timer event 39 */ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */ BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */ BBR_LOG_PACING_CALC, /* calc the pacing time 43 */ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */ BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */ BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ TCP_LOG_END /* End (keep at end) 49 */ }; enum tcp_log_states { TCP_LOG_STATE_CLEAR = -1, /* Deactivate and clear tracing */ TCP_LOG_STATE_OFF = 0, /* Pause */ TCP_LOG_STATE_TAIL=1, /* Keep the trailing events */ TCP_LOG_STATE_HEAD=2, /* Keep the leading events */ TCP_LOG_STATE_HEAD_AUTO=3, /* Keep the leading events, and automatically dump them to the device */ TCP_LOG_STATE_CONTINUAL=4, /* Continually dump the data when full */ TCP_LOG_STATE_TAIL_AUTO=5, /* Keep the trailing events, and automatically dump them when the session ends */ }; /* Use this if we don't know whether the operation succeeded. */ #define ERRNO_UNK (-1) /* * If the user included dev/tcp_log/tcp_log_dev.h, then include our private * headers. Otherwise, there is no reason to pollute all the files with an * additional include. * * This structure is aligned to an 8-byte boundary to match the alignment * requirements of (struct tcp_log_buffer). */ #ifdef __tcp_log_dev_h__ struct tcp_log_header { struct tcp_log_common_header tlh_common; #define tlh_version tlh_common.tlch_version #define tlh_type tlh_common.tlch_type #define tlh_length tlh_common.tlch_length struct in_endpoints tlh_ie; struct timeval tlh_offset; /* Uptime -> UTC offset */ char tlh_id[TCP_LOG_ID_LEN]; char tlh_reason[TCP_LOG_REASON_LEN]; uint8_t tlh_af; uint8_t _pad[7]; } ALIGN_TCP_LOG; #ifdef _KERNEL struct tcp_log_dev_log_queue { struct tcp_log_dev_queue tldl_common; char tldl_id[TCP_LOG_ID_LEN]; char tldl_reason[TCP_LOG_REASON_LEN]; struct in_endpoints tldl_ie; struct tcp_log_stailq tldl_entries; int tldl_count; uint8_t tldl_af; }; #endif /* _KERNEL */ #endif /* __tcp_log_dev_h__ */ #ifdef _KERNEL #define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000 #define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000 /* * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always * tries to record verbose information. */ #define TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ tp->t_output_caller, __func__, __LINE__, tv); \ } while (0) /* * TCP_LOG_EVENT: This is a macro so we can capture function/line * information when needed. 
* * Prototype: * TCP_LOG_EVENT(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, * struct sockbuf *txbuf, uint8_t eventid, int errornum, * union tcp_log_stackspecific *stackinfo) * * tp is mandatory and must be write locked. * th is optional; if present, it will appear in the record. * rxbuf and txbuf are optional; if present, they will appear in the record. * eventid is mandatory. * errornum is mandatory (it indicates the success or failure of the * operation associated with the event). * len indicates the length of the packet. If no packet, use 0. * stackinfo is optional; if present, it will appear in the record. */ #ifdef TCP_LOG_FORCEVERBOSE #define TCP_LOG_EVENT TCP_LOG_EVENT_VERBOSE #else #define TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \ do { \ if (tcp_log_verbose) \ TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, \ eventid, errornum, len, stackinfo, \ th_hostorder, NULL); \ else if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ NULL, NULL, 0, NULL); \ } while (0) #endif /* TCP_LOG_FORCEVERBOSE */ #define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ NULL, NULL, 0, tv); \ } while (0) #ifdef TCP_BLACKBOX extern bool tcp_log_verbose; void tcp_log_drain(struct tcpcb *tp); int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force); void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason); struct tcp_log_buffer *tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *tv); size_t tcp_log_get_id(struct tcpcb *tp, char *buf); u_int tcp_log_get_id_cnt(struct tcpcb *tp); int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp); void tcp_log_init(void); int tcp_log_set_id(struct tcpcb *tp, char *id); int tcp_log_state_change(struct tcpcb *tp, int state); void tcp_log_tcpcbinit(struct tcpcb *tp); void tcp_log_tcpcbfini(struct tcpcb *tp); void tcp_log_flowend(struct tcpcb *tp); #else /* !TCP_BLACKBOX */ #define tcp_log_verbose (false) static inline struct tcp_log_buffer * tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *tv) { return (NULL); } #endif /* TCP_BLACKBOX */ #endif /* _KERNEL */ #endif /* __tcp_log_buf_h__ */ Index: head/sys/netinet/tcp_output.c =================================================================== --- head/sys/netinet/tcp_output.c (revision 334803) +++ head/sys/netinet/tcp_output.c (revision 334804) @@ -1,1911 +1,2044 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #define TCPOUTFLAGS #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include VNET_DEFINE(int, path_mtu_discovery) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); VNET_DEFINE(int, tcp_sendbuf_auto_lowat) = 0; #define V_tcp_sendbuf_auto_lowat VNET(tcp_sendbuf_auto_lowat) SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendbuf_auto_lowat), 0, "Modify threshold for auto send buffer growth to account for SO_SNDLOWAT"); /* * Make sure that either retransmit or persist timer is set for SYN, FIN and * non-ACK. 
*/ #define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags) \ KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\ tcp_timer_active((tp), TT_REXMT) || \ tcp_timer_active((tp), TT_PERSIST), \ ("neither rexmt nor persist timer is set")) -#ifdef TCP_HHOOK -static void inline hhook_run_tcp_est_out(struct tcpcb *tp, - struct tcphdr *th, struct tcpopt *to, - uint32_t len, int tso); -#endif static void inline cc_after_idle(struct tcpcb *tp); #ifdef TCP_HHOOK /* * Wrapper for the TCP established output helper hook. */ -static void inline +void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_data.len = len; hhook_data.tso = tso; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data, tp->osd); } } #endif /* * CC wrapper hook functions */ static void inline cc_after_idle(struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); } /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; int32_t len; uint32_t recwin, sendwin; int off, flags, error = 0; /* Keep compiler happy */ struct mbuf *m; struct ip *ip = NULL; #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif int idle, sendalot, curticks; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso, mtu; struct tcpopt to; unsigned int wanted_cookie = 0; unsigned int dont_sendalot = 0; #if 0 int maxburst = TCP_MAXBURST; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ return (0); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. 
* If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { uint32_t cwin; cwin = imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0); /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else /* Can rexmit part of the current hole */ len = ((int32_t)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((int32_t)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); if (len > 0) { sack_rxmit = 1; sendalot = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, tp->t_maxseg)); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < sbused(&so->so_snd)) flags &= ~TH_FIN; sendwin = 1; } else { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) len = ((int32_t)min(sbavail(&so->so_snd), sendwin) - off); else { int32_t cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) - off); /* * Don't remove this (len > 0) check ! 
* We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = imin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; off--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } /* * On TFO sockets, ensure no data is sent in the following cases: * * - When retransmitting SYN|ACK on a passively-created socket * * - When retransmitting SYN on an actively created socket * * - When sending a zero-length cookie (cookie request) on an * actively created socket * * - When the socket is in the CLOSED state (RST is being sent) */ if (IS_FASTOPEN(tp->t_flags) && (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || ((tp->t_state == TCPS_SYN_SENT) && (tp->t_tfo_client_cookie_len == 0)) || (flags & TH_RST))) len = 0; if (len <= 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. * * We also do a general check here to ensure that * we will set the persist timer when we have data * to send, but a 0-byte window. This makes sure * the persist timer is set even if the packet * hits one of the "goto send" lines below. */ len = 0; if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (off < (int) sbavail(&so->so_snd))) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_timer_active(tp, TT_PERSIST)) tcp_setpersist(tp); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); tcp_sndbuf_autoscale(tp, so, sendwin); /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and * IP options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. * * IPv4 handling has a clear separation of ip options and ip header * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does * the right thing below to provide length of just ip options and thus * checking for ipoptlen is enough to decide if ip options are present. 
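 *
 * Condensed, the eligibility test below amounts to the following
 * (an illustrative paraphrase of the actual check, not an extra one):
 *
 *	tso = (tp->t_flags & TF_TSO) && V_tcp_do_tso &&
 *	    len > tp->t_maxseg && !(tp->t_flags & TF_SIGNATURE) &&
 *	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 *	    ipoptlen == 0 && !(flags & TH_SYN);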
*/ #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6)) ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4)) ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); #endif /* INET */ #endif /* IPSEC */ #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && ipoptlen == 0 && !(flags & TH_SYN)) tso = 1; if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } recwin = lmin(lmax(sbspace(&so->so_rcv), 0), (long)TCP_MAXWIN << tp->rcv_scale); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && (uint32_t)len + (uint32_t)off >= sbavail(&so->so_snd) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read * from the receive buffer, no matter how small, causes a window * update to be sent. We also should avoid sending a flurry of * window updates when the socket buffer had queued a lot of data * and the application is doing small reads. * * Prevent a flurry of pointless window updates by only sending * an update when we can increase the advertized window by more * than 1/4th of the socket buffer capacity. When the buffer is * getting full or is very small be more aggressive and send an * update whenever we can increase by two mss sized segments. * In all other situations the ACK's to new incoming data will * carry further window increases. 
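 *
 * For illustration only, with assumed values of sb_hiwat = 65536 and
 * t_maxseg = 1460: an update is forced once adv reaches 16384 (1/4th
 * of the buffer), or once adv reaches 2920 (two segments) while
 * recwin has fallen to 8192 or less, or once 2 * adv >= 65536.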
* * Don't send an independent window update if a delayed * ACK is pending (it will get piggy-backed on it) or the * remote side already has done a half-close and won't send * more data. Skip this if the connection is in T/TCP * half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ int32_t adv; int oldwin; adv = recwin; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); adv -= oldwin; } else oldwin = 0; /* * If the new window size ends up being the same as or less * than the old size when it is scaled, then don't force * a window update. */ if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (int32_t)(2 * tp->t_maxseg) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) goto send; if (2 * adv >= (int32_t)so->so_rcv.sb_hiwat) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_timer_active(tp, TT_PERSIST) * is true when we are in persist state. * (tp->t_flags & TF_FORCEDATA) * is set when we are called to send a persist packet. * tcp_timer_active(tp, TT_REXMT) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. 
* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); /* * Compute options for segment. * We only have to care about SYN and established connection * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); to.to_flags |= TOF_MSS; /* * On SYN or SYN|ACK transmits on TFO connections, * only include the TFO option if it is not a * retransmit, as the presence of the TFO option may * have caused the original SYN or SYN|ACK to have * been dropped by a middlebox. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_rxtshift == 0)) { if (tp->t_state == TCPS_SYN_RECEIVED) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_int8_t *)&tp->t_tfo_cookie.server; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } else if (tp->t_state == TCPS_SYN_SENT) { to.to_tfo_len = tp->t_tfo_client_cookie_len; to.to_tfo_cookie = tp->t_tfo_cookie.client; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; /* * If we wind up having more data to * send with the SYN than can fit in * one segment, don't send any more * until the SYN|ACK comes back from * the other end. */ dont_sendalot = 1; } } } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { curticks = tcp_ts_getticks(); to.to_tsval = curticks + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; if (tp->t_rxtshift == 1) tp->t_badrxtwin = curticks; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ /* * Check that TCP_MD5SIG is enabled in tcpcb to * account the size needed to set this TCP option. */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); /* * If we wanted a TFO option to be added, but it was unable * to fit, ensure no data is sent. */ if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && !(to.to_flags & TOF_FASTOPEN)) len = 0; } /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxseg length. * Clear the FIN bit because we cut off the tail of * the segment. 
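 *
 * For example (assumed values): with t_maxseg = 1460, a 12 byte
 * (NOP-padded) timestamp option and no IP options, a 1460 byte
 * payload would become 1472 > 1460, so the non-TSO branch below
 * trims len to 1460 - 12 = 1448 and sets sendalot to come around
 * again for the remainder.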
*/ if (len + optlen + ipoptlen > tp->t_maxseg) { flags &= ~TH_FIN; if (tso) { u_int if_hw_tsomax; u_int if_hw_tsomaxsegcount; u_int if_hw_tsomaxsegsize; struct mbuf *mb; u_int moff; int max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; /* * Limit a TSO burst to prevent it from * overflowing or exceeding the maximum length * allowed by the network interface: */ KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Check if we should limit by maximum segment * size and count: */ if (if_hw_tsomaxsegcount != 0 && if_hw_tsomaxsegsize != 0) { /* * Subtract one segment for the LINK * and TCP/IP headers mbuf that will * be prepended to this mbuf chain * after the code in this section * limits the number of mbufs in the * chain to if_hw_tsomaxsegcount. */ if_hw_tsomaxsegcount -= 1; max_len = 0; mb = sbsndmbuf(&so->so_snd, off, &moff); while (mb != NULL && max_len < len) { u_int mlen; u_int frags; /* * Get length of mbuf fragment * and how many hardware frags, * rounded up, it would use: */ mlen = (mb->m_len - moff); frags = howmany(mlen, if_hw_tsomaxsegsize); /* Handle special case: Zero Length Mbuf */ if (frags == 0) frags = 1; /* * Check if the fragment limit * will be reached or exceeded: */ if (frags >= if_hw_tsomaxsegcount) { max_len += min(mlen, if_hw_tsomaxsegcount * if_hw_tsomaxsegsize); break; } max_len += mlen; if_hw_tsomaxsegcount -= frags; moff = 0; mb = mb->m_next; } if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Prevent the last segment from being * fractional unless the send sockbuf can be * emptied: */ max_len = (tp->t_maxseg - optlen); if (((uint32_t)off + (uint32_t)len) < sbavail(&so->so_snd)) { moff = len % max_len; if (moff != 0) { len -= moff; sendalot = 1; } } /* * In case there are too many small fragments * don't use TSO: */ if (len <= max_len) { len = max_len; sendalot = 1; tso = 0; } /* * Send the FIN in a separate segment * after the bulk sending is done. * We don't trust the TSO implementations * to clear the FIN flag on all but the * last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; if (dont_sendalot) sendalot = 0; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. 
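 *
 * Note that a payload small enough to share the header mbuf is
 * copied in place with m_copydata(), while anything larger is
 * attached by reference with m_copym() so the bulk data is not
 * copied a second time.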
*/ if (len) { struct mbuf *mb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) TCPSTAT_INC(tcps_sndprobe); else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf * to the offset in the socket buffer chain. */ mb = sbsndptr(&so->so_snd, off, len, &moff); if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(mb, moff, len, mtod(m, caddr_t) + hdrlen); m->m_len += len; } else { m->m_next = m_copym(mb, moff, len, M_NOWAIT); if (m->m_next == NULL) { SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if (((uint32_t)off + (uint32_t)len == sbused(&so->so_snd)) && !(flags & TH_SYN)) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(tp->t_inpcb, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(tp->t_inpcb, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(tp->t_inpcb, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE|TH_CWR; } else flags |= TH_ECE|TH_CWR; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with * ECN capable transmission (ECT). * Ignore pure ack packets, retransmissions and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !((tp->t_flags & TF_FORCEDATA) && len == 1)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_ect0); } /* * Reply with proper ECN notifications. 
*/ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (recwin < (so->so_rcv.sb_hiwat / 4) && recwin < tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (tp->rcv_adv - tp->rcv_nxt)) recwin = (tp->rcv_adv - tp->rcv_nxt); /* * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. The * case is handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data than can be buffered prior to transmitting on * the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* * Calculate MD5 signature and put it into the place * determined before. * NOTE: since TCP options buffer doesn't point into * mbuf's data, calculate offset and use it. */ if (!TCPMD5_ENABLED() || (error = TCPMD5_OUTPUT(m, th, (u_char *)(th + 1) + (to.to_signature - opt))) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. */ m_freem(m); goto out; } } #endif #ifdef INET6 if (isipv6) { /* * There is no need to fill in ip6_plen right now. * It will be filled later by ip6_output. 
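 *
 * Note that th_sum is only seeded with the pseudo-header sum just
 * below; CSUM_TCP_IPV6, together with csum_data (set above to the
 * offset of th_sum), asks the NIC or a software fallback such as
 * in6_delayed_cksum() to finish the checksum over the TCP header
 * and payload. The IPv4 branch further down does the same with
 * CSUM_TCP and in_pseudo().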
*/ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. */ if (tso) { KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); #else KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); #endif #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ TCP_PROBE3(debug__output, tp, th, m); /* We're getting ready to send; log now. */ TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, len, NULL, false); /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); /* * Set the packet size here for the benefit of DTrace probes. * ip6_output() will set it properly; it's supposed to include * the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &tp->t_inpcb->inp_route6, ((so->so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); if (error == EMSGSIZE && tp->t_inpcb->inp_route6.ro_rt != NULL) mtu = tp->t_inpcb->inp_route6.ro_rt->rt_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every packet. * This might not be the best thing to do according to RFC3390 * Section 2. However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { ip->ip_off |= htons(IP_DF); tp->t_flags2 |= TF2_PLPMTU_PMTUD; } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL) mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu; } #endif /* INET */ out: /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ timer: if (!tcp_timer_active(tp, TT_REXMT) && ((sack_rxmit && tp->snd_nxt != tp->snd_max) || (tp->snd_nxt != tp->snd_una))) { if (tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } else if (len == 0 && sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { /* * Avoid a situation where we do not set persist timer * after a zero window condition. For example: * 1) A -> B: packet with enough data to fill the window * 2) B -> A: ACK for #1 + new data (0 window * advertisement) * 3) A -> B: ACK for #2, 0 len packet * * In this case, A will not activate the persist timer, * because it chose to send a packet. Unless tcp_output * is called for some other reason (delayed ack timer, * another input packet from B, socket syscall), A will * not send zero window probes. * * So, if you send a 0-length packet, but there is data * in the socket buffer, and neither the rexmt or * persist timer is already set, then activate the * persist timer. 
*/ tp->t_rxtshift = 0; tcp_setpersist(tp); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. */ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + xlen; } if (error) { /* Record the error. */ TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, error, 0, NULL, false); /* * We know that the packet was lost, so back out the * sequence number advance, if any. * * If the error is EPERM the packet got blocked by the * local firewall. Normally we should terminate the * connection but the blocking may have been spurious * due to a firewall reconfiguration cycle. So we treat * it like a packet loss and let the retransmit timer and * timeouts do their work over time. * XXX: It is a POLA question whether calling tcp_drop right * away would be the really correct behavior instead. */ if (((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) && ((flags & TH_SYN) == 0) && (error != EPERM)) { if (sack_rxmit) { p->rxmit -= len; tp->sackhint.sack_bytes_rexmit -= len; KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); } else tp->snd_nxt -= len; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EACCES: case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: TCP_XMIT_TIMER_ASSERT(tp, len, flags); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. * If TSO was active we either got an interface * without TSO capabilits or TSO was turned off. * If we obtained mtu from ip_output() then update * it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } return (error); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } /* FALLTHROUGH */ default: return (error); } } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. */ if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); #if 0 /* * This completely breaks TCP if newreno is turned on. What happens * is that if delayed-acks are turned on on the receiver, this code * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window). */ if (sendalot && --maxburst) goto again; #endif if (sendalot) goto again; return (0); } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } /* * Insert TCP options according to the supplied parameters to the place * optp in a consistent way. Can handle unaligned destinations. 
* * The order of the option processing is crucial for optimal packing and * alignment for the scarce option space. * * The optimal order for a SYN/SYN-ACK segment is: * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. * * The SACK options should be last. SACK blocks consume 8*n+2 bytes. * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). * At minimum we need 10 bytes (to generate 1 SACK block). If both * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, * we only have 10 bytes for SACK options (40 - (12 + 18)). */ int tcp_addoptions(struct tcpopt *to, u_char *optp) { u_int32_t mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) continue; if (optlen == TCP_MAXOLEN) break; switch (to->to_flags & mask) { case TOF_MSS: while (optlen % 4) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) continue; optlen += TCPOLEN_MAXSEG; *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; to->to_mss = htons(to->to_mss); bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); optp += sizeof(to->to_mss); break; case TOF_SCALE: while (!optlen || optlen % 2 != 1) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) continue; optlen += TCPOLEN_WINDOW; *optp++ = TCPOPT_WINDOW; *optp++ = TCPOLEN_WINDOW; *optp++ = to->to_wscale; break; case TOF_SACKPERM: while (optlen % 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) continue; optlen += TCPOLEN_SACK_PERMITTED; *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; break; case TOF_TS: while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) continue; optlen += TCPOLEN_TIMESTAMP; *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; to->to_tsval = htonl(to->to_tsval); to->to_tsecr = htonl(to->to_tsecr); bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); optp += sizeof(to->to_tsval); bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) { to->to_flags &= ~TOF_SIGNATURE; continue; } optlen += TCPOLEN_SIGNATURE; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; to->to_signature = optp; while (siglen--) *optp++ = 0; break; } case TOF_SACK: { int sackblks = 0; struct sackblk *sack = (struct sackblk *)to->to_sacks; tcp_seq sack_seq; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) continue; optlen += TCPOLEN_SACKHDR; *optp++ = TCPOPT_SACK; sackblks = min(to->to_nsacks, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; while (sackblks--) { sack_seq = htonl(sack->start); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); sack_seq = htonl(sack->end); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); optlen += TCPOLEN_SACK; sack++; } TCPSTAT_INC(tcps_sack_send_blocks); break; } case TOF_FASTOPEN: { int total_len; /* XXX is there any point to aligning this option? 
*/ total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len; if (TCP_MAXOLEN - optlen < total_len) { to->to_flags &= ~TOF_FASTOPEN; continue; } *optp++ = TCPOPT_FAST_OPEN; *optp++ = total_len; if (to->to_tfo_len > 0) { bcopy(to->to_tfo_cookie, optp, to->to_tfo_len); optp += to->to_tfo_len; } optlen += total_len; break; } default: panic("%s: unknown TCP option type", __func__); break; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." */ while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); return (optlen); +} + +/* + * This is a copy of m_copym(), taking the TSO segment size/limit + * constraints into account, and advancing the sndptr as it goes. + */ +struct mbuf * +tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, + int32_t seglimit, int32_t segsize, struct sockbuf *sb) +{ + struct mbuf *n, **np; + struct mbuf *top; + int32_t off = off0; + int32_t len = *plen; + int32_t fragsize; + int32_t len_cp = 0; + int32_t *pkthdrlen; + uint32_t mlen, frags; + bool copyhdr; + + + KASSERT(off >= 0, ("tcp_m_copym, negative off %d", off)); + KASSERT(len >= 0, ("tcp_m_copym, negative len %d", len)); + if (off == 0 && m->m_flags & M_PKTHDR) + copyhdr = true; + else + copyhdr = false; + while (off > 0) { + KASSERT(m != NULL, ("tcp_m_copym, offset > size of mbuf chain")); + if (off < m->m_len) + break; + off -= m->m_len; + if ((sb) && (m == sb->sb_sndptr)) { + sb->sb_sndptroff += m->m_len; + sb->sb_sndptr = m->m_next; + } + m = m->m_next; + } + np = ⊤ + top = NULL; + pkthdrlen = NULL; + while (len > 0) { + if (m == NULL) { + KASSERT(len == M_COPYALL, + ("tcp_m_copym, length > size of mbuf chain")); + *plen = len_cp; + if (pkthdrlen != NULL) + *pkthdrlen = len_cp; + break; + } + mlen = min(len, m->m_len - off); + if (seglimit) { + /* + * For M_NOMAP mbufs, add 3 segments + * + 1 in case we are crossing page boundaries + * + 2 in case the TLS hdr/trailer are used + * It is cheaper to just add the segments + * than it is to take the cache miss to look + * at the mbuf ext_pgs state in detail. + */ + if (m->m_flags & M_NOMAP) { + fragsize = min(segsize, PAGE_SIZE); + frags = 3; + } else { + fragsize = segsize; + frags = 0; + } + + /* Break if we really can't fit anymore. */ + if ((frags + 1) >= seglimit) { + *plen = len_cp; + if (pkthdrlen != NULL) + *pkthdrlen = len_cp; + break; + } + + /* + * Reduce size if you can't copy the whole + * mbuf. If we can't copy the whole mbuf, also + * adjust len so the loop will end after this + * mbuf. 
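+ *
+ * For illustration (assumed numbers): with seglimit = 8,
+ * segsize = 1448 and a plain (mapped) 16 kB mbuf, frags starts at
+ * 0 and howmany(16384, 1448) = 12 >= 8, so mlen is trimmed to
+ * (8 - 0 - 1) * 1448 = 10136; the remaining 6248 bytes are left
+ * in the socket buffer for a later pass.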
+ */ + if ((frags + howmany(mlen, fragsize)) >= seglimit) { + mlen = (seglimit - frags - 1) * fragsize; + len = mlen; + *plen = len_cp + len; + if (pkthdrlen != NULL) + *pkthdrlen = *plen; + } + frags += howmany(mlen, fragsize); + if (frags == 0) + frags++; + seglimit -= frags; + KASSERT(seglimit > 0, + ("%s: seglimit went too low", __func__)); + } + if (copyhdr) + n = m_gethdr(M_NOWAIT, m->m_type); + else + n = m_get(M_NOWAIT, m->m_type); + *np = n; + if (n == NULL) + goto nospace; + if (copyhdr) { + if (!m_dup_pkthdr(n, m, M_NOWAIT)) + goto nospace; + if (len == M_COPYALL) + n->m_pkthdr.len -= off0; + else + n->m_pkthdr.len = len; + pkthdrlen = &n->m_pkthdr.len; + copyhdr = false; + } + n->m_len = mlen; + len_cp += n->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data + off; + mb_dupcl(n, m); + } else + bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + (u_int)n->m_len); + + if (sb && (sb->sb_sndptr == m) && + ((n->m_len + off) >= m->m_len) && m->m_next) { + sb->sb_sndptroff += m->m_len; + sb->sb_sndptr = m->m_next; + } + off = 0; + if (len != M_COPYALL) { + len -= n->m_len; + } + m = m->m_next; + np = &n->m_next; + } + return (top); +nospace: + m_freem(top); + return (NULL); } void tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin) { /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwidth product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwidth (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. * * XXXGL: should there be used sbused() or sbavail()? */ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { int lowat; lowat = V_tcp_sendbuf_auto_lowat ? 
so->so_snd.sb_lowat : 0; if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat - lowat && sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) - lowat && sbused(&so->so_snd) < V_tcp_autosndbuf_max && sendwin >= (sbused(&so->so_snd) - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), so, curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } } Index: head/sys/netinet/tcp_stacks/fastpath.c =================================================================== --- head/sys/netinet/tcp_stacks/fastpath.c (revision 334803) +++ head/sys/netinet/tcp_stacks/fastpath.c (revision 334804) @@ -1,2438 +1,2438 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2015 Netflix Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Portions of this software were developed by Randall R. Stewart while * working for Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #include #include static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* * So how is this faster than the normal fast ack? * It basically allows us to also stay in the fastpath * when a window-update ack also arrives. In testing * we saw only 25-30% of connections doing fastpath * due to the fact that along with moving forward * in sequence the window was also updated. */ static void tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, int ti_locked, uint32_t tiwin) { int acked; uint16_t nsegs; int winup_only=0; nsegs = max(1, m->m_pkthdr.lro_nsegs); #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * The following if statement will be true if * we are doing the win_up_in_fp * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) * - No more new data, but we have an ack for new data * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) * - No more new data, the same ack point but the window grew * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd) */ if ((SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { winup_only = 1; TCPSTAT_INC(tcps_rcvwinupd); } tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). 
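 * (Aside, for illustration only: the window-update acceptance test
 * described near the top of this function can be read as the standalone
 * predicate sketched below.  The helper name is hypothetical and the
 * block is not compiled.)
 */
#if 0	/* Illustrative sketch only; not part of this change. */
static bool
tcp_accept_wnd_update(const struct tcpcb *tp, tcp_seq seq, tcp_seq ack,
    uint32_t tiwin)
{
	/* New data, or a new ACK, or the same ACK with a larger window. */
	return (SEQ_LT(tp->snd_wl1, seq) ||
	    (tp->snd_wl1 == seq &&
	    (SEQ_LT(tp->snd_wl2, ack) ||
	    (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))));
}
#endif
/*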
*/ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * This is a pure ack for outstanding data. */ if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } if (winup_only == 0) { acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, nsegs, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); m_freem(m); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } else { /* * Window update only, just free the mbufs and * send out whatever we can. */ m_freem(m); } sowwakeup(so); if (sbavail(&so->so_snd)) (void) tcp_output(tp); KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); } /* * Here nothing is really faster, its just that we * have broken out the fast-data path also just like * the fast-ack. */ static void tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, int ti_locked, uint32_t tiwin) { int newsize = 0; /* automatic sockbuf scaling */ #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. 
* NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * This is a pure, in-sequence data packet with * nothing on the reassembly queue and we have enough * buffer space to take it. */ if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); } /* * The slow-path is the clone of the long long part * of tcp_do_segment past all the fast-path stuff. We * use it here by two different callers, the fast/slow and * the fastack only. */ static void tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, int ti_locked, uint32_t tiwin, int thflags) { int acked, ourfinisacked, needoutput = 0; int rstreason, todrop, win; uint16_t nsegs; char *s; struct in_conninfo *inc; struct mbuf *mfree = NULL; nsegs = max(1, m->m_pkthdr.lro_nsegs); #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ inc = &tp->t_inpcb->inp_inc; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains a RST with valid ACK (SEQ.ACK has already * been verified), then drop the connection. * if seg contains a RST without an ACK, drop the seg. 
* if seg does not contain SYN, then drop the seg. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, m, tp, th); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, m, tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " "ti_locked %d", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 
960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. */ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT(ti_locked == TI_RLOCKED, ("%s: TH_RST ti_locked %d, th %p tp %p", __func__, ti_locked, th, tp)); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: /* FALLTHROUGH */ default: tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { KASSERT(ti_locked == TI_RLOCKED, ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. 
Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. 
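 * (Aside, for illustration only: the modified boundary test, written as
 * a standalone predicate over plain sequence numbers.  The helper name
 * is hypothetical and the block is not compiled.)
 */
#if 0	/* Illustrative sketch only; not part of this change. */
static bool
tcp_ts_should_update(tcp_seq last_ack_sent, tcp_seq seg_seq, int seg_len,
    int syn_or_fin)
{
	/*
	 * A SYN or FIN occupies sequence space, so the upper bound is
	 * widened by one when either flag is present.
	 */
	return (SEQ_LEQ(seg_seq, last_ack_sent) &&
	    SEQ_LEQ(last_ack_sent, seg_seq + seg_len + (syn_or_fin ? 1 : 0)));
}
#endif
/*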
*/ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, m, tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to->to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, to, th->th_ack); else /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. */ tp->sackhint.sacked_bytes = 0; #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close. */ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). 
* * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, nsegs, CC_DUPACK); if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ if (V_tcp_do_rfc6675_pipe) awnd = tcp_compute_pipe(tp); else awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += tp->t_maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += tp->t_maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (tp->t_flags & TF_SACK_PERMIT) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, nsegs, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { TCPSTAT_INC( tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { /* * Process first and second duplicate * ACKs. Each indicates a segment * leaving the network, creating room * for more. Make sure we can send a * packet on reception of each duplicate * ACK by increasing snd_cwnd by one * segment. Restore the original * snd_cwnd after packet transmission. */ cc_ack_received(tp, th, nsegs, CC_DUPACK); uint32_t oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs. 
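 * (Aside, for illustration only: "new data available" here means bytes
 * queued in the send buffer beyond what is already in flight, as in the
 * sketch below.  The helper name is hypothetical and the block is not
 * compiled.)
 */
#if 0	/* Illustrative sketch only; not part of this change. */
static int
tcp_new_data_avail(struct tcpcb *tp, struct sockbuf *sb)
{
	/* Bytes queued minus bytes already sent but not yet acked. */
	return ((int)(sbavail(sb) - (tp->snd_nxt - tp->snd_una)));
}
#endif
/*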
*/ SOCKBUF_LOCK(&so->so_snd); avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else cc_post_recovery(tp, th); } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * Let the congestion control algorithm update congestion * control related information. 
This typically means increasing * the congestion window. */ cc_ack_received(tp, th, nsegs, CC_ACK); SOCKBUF_LOCK(&so->so_snd); if (acked > sbavail(&so->so_snd)) { tp->snd_wnd -= sbavail(&so->so_snd); mfree = sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* XXXLAS: Can this be moved up into cc_post_recovery? */ if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. 
This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp, tlen)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) tcp_update_sack_list(tp, save_start, save_start + tlen); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. 
*/ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " "TCP_FIN_WAIT_2 ti_locked: %d", __func__, ti_locked)); tcp_twstart(tp); INP_INFO_RUNLOCK(&V_tcbinfo); return; } } if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. 
*/ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__drop, tp, th, m); if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS else INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); #endif /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__drop, tp, th, m); if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); m_freem(m); } /* * Do fast slow is a combination of the original * tcp_dosegment and a split fastpath, one function * for the fast-ack which also includes allowing fastpath * for window advanced in sequence acks. And also a * sub-function that handles the insequence data. */ void tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, int ti_locked) { int thflags; uint32_t tiwin; char *s; uint16_t nsegs; int can_enter; struct in_conninfo *inc; struct tcpopt to; thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; nsegs = max(1, m->m_pkthdr.lro_nsegs); /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we * allow the tcbinfo to be in either alocked or unlocked, as the * caller may have unnecessarily acquired a write lock due to a race. */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " "SYN/FIN/RST/!EST", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS if (ti_locked == TI_RLOCKED) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); free(s, M_TCPLOG); } if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first. 
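 * (Aside, for illustration only: in SYN-SENT an ACK is acceptable iff
 * ISS < SEG.ACK <= SND.MAX; anything else is answered with a RST by the
 * test below.  The helper name is hypothetical and the block is not
 * compiled.)
 */
#if 0	/* Illustrative sketch only; not part of this change. */
static bool
tcp_synsent_ack_ok(const struct tcpcb *tp, tcp_seq ack)
{
	return (SEQ_GT(ack, tp->iss) && SEQ_LEQ(ack, tp->snd_max));
}
#endif
/*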
*/ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED); if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } INP_WUNLOCK(tp->t_inpcb); return; } tp->sackhint.last_sack_ack = 0; /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; /* * Unscale the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * TCP ECN processing. */ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session and vice versa. */ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); } } if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "no action\n", s, __func__); free(s, M_TCPLOG); } } can_enter = 0; if (__predict_true((tlen == 0))) { /* * The ack moved forward and we have a window (non-zero) * * The ack did not move forward, but the window increased. */ if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) || ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) { can_enter = 1; } } else { /* * Data incoming, use the old entry criteria * for fast-path with data. 
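 * (Aside, for illustration only: the combined "can_enter" pre-check can
 * be read as the sketch below.  The helper name is hypothetical and the
 * block is not compiled.)
 */
#if 0	/* Illustrative sketch only; not part of this change. */
static int
tcp_fastpath_can_enter(const struct tcpcb *tp, tcp_seq ack, uint32_t tiwin,
    int tlen)
{
	if (tlen == 0)
		/* Pure ACK: ack advanced with a window, or window grew. */
		return ((SEQ_GT(ack, tp->snd_una) && tiwin) ||
		    (ack == tp->snd_una && tiwin && tiwin > tp->snd_wnd));
	/* With data: a window is offered and it is unchanged. */
	return (tiwin && tiwin == tp->snd_wnd);
}
#endif
/*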
*/ if ((tiwin && tiwin == tp->snd_wnd)) { can_enter = 1; } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. */ if (__predict_true(tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && can_enter && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && LIST_EMPTY(&tp->t_segq) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) { if (__predict_true((tlen == 0) && (SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)))) { /* We are done */ tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, ti_locked, tiwin); return; } else if ((tlen) && (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv))) { tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen, ti_locked, tiwin); /* We are done */ return; } } tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, ti_locked, tiwin, thflags); } /* * This subfunction is used to try to highly optimize the * fast path. We again allow window updates that are * in sequence to remain in the fast-path. We also add * in the __predict's to attempt to help the compiler. * Note that if we return a 0, then we can *not* process * it and the caller should push the packet into the * slow-path. */ static int tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, int ti_locked, uint32_t tiwin) { int acked; uint16_t nsegs; int winup_only=0; nsegs = max(1, m->m_pkthdr.lro_nsegs); #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* Old ack, behind (or duplicate to) the last one rcv'd */ return (0); } if (__predict_false(th->th_ack == tp->snd_una) && __predict_false(tiwin <= tp->snd_wnd)) { /* duplicate ack a shrinking dup ack with shrinking window */ return (0); } if (__predict_false(tiwin == 0)) { /* zero window */ return (0); } if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { /* Above what we have sent? */ return (0); } if (__predict_false(tp->snd_nxt != tp->snd_max)) { /* We are retransmitting */ return (0); } if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) { /* We need a SYN or a FIN, unlikely.. */ return (0); } if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { /* Timestamp is behind .. old ack with seq wrap? */ return (0); } if (__predict_false(IN_RECOVERY(tp->t_flags))) { /* Still recovering */ return (0); } if (__predict_false(to->to_flags & TOF_SACK)) { /* Sack included in the ack.. 
*/ return (0); } if (!TAILQ_EMPTY(&tp->snd_holes)) { /* We have sack holes on our scoreboard */ return (0); } /* Ok if we reach here, we can process a fast-ack */ /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { /* keep track of pure window updates */ if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { winup_only = 1; TCPSTAT_INC(tcps_rcvwinupd); } tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * This is a pure ack for outstanding data. */ if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } if (winup_only == 0) { acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, nsegs, CC_ACK); tp->snd_una = th->th_ack; tp->t_dupacks = 0; /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); m_freem(m); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* Wake up the socket if we have room to write more */ sowwakeup(so); } else { /* * Window update only, just free the mbufs and * send out whatever we can. 
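 *
 * Editor's note: the RTT sampling done above has two sources, preferring
 * the echoed RFC 1323 timestamp over the single timed segment.  A sketch
 * of that selection, pulled out as a hypothetical helper for reference:
 */

static inline void
tcp_fastack_rtt_sample(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
{
	uint32_t t;

	if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
		/* RTT measured from the echoed timestamp, in ts ticks. */
		t = tcp_ts_getticks() - to->to_tsecr;
		if (!tp->t_rttlow || tp->t_rttlow > t)
			tp->t_rttlow = t;
		tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
	} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
		/* Otherwise fall back to the one segment we timed. */
		if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
			tp->t_rttlow = ticks - tp->t_rtttime;
		tcp_xmit_timer(tp, ticks - tp->t_rtttime);
	}
}

/*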
*/ m_freem(m); } if (sbavail(&so->so_snd)) (void) tcp_output(tp); KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return (1); } /* * This tcp-do-segment concentrates on making the fastest * ack processing path. It does not have a fast-path for * data (it possibly could which would then eliminate the * need for fast-slow above). For a content distributor having * large outgoing elephants and very very little coming in * having no fastpath for data does not really help (since you * don't get much data in). The most important thing is * processing ack's quickly and getting the rest of the data * output to the peer as quickly as possible. This routine * seems to be about an overall 3% faster then the old * tcp_do_segment and keeps us in the fast-path for packets * much more (by allowing window updates to also stay in the fastpath). */ void tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, int ti_locked) { int thflags; uint32_t tiwin; char *s; struct in_conninfo *inc; struct tcpopt to; thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we * allow the tcbinfo to be in either alocked or unlocked, as the * caller may have unnecessarily acquired a write lock due to a race. */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " "SYN/FIN/RST/!EST", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS if (ti_locked == TI_RLOCKED) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); free(s, M_TCPLOG); } if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED); if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } INP_WUNLOCK(tp->t_inpcb); return; } tp->sackhint.last_sack_ack = 0; /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; /* * Unscale the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * TCP ECN processing. 
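 *
 * Editor's note: the block below implements the RFC 3168 echo loop: a
 * CE-marked IP header latches TF_ECN_SND_ECE so every subsequent ACK
 * carries ECE, and the peer's CWR clears it again.  A minimal sketch of
 * that latch (hypothetical helper, not part of this commit):
 */

static inline void
tcp_ecn_echo_latch(struct tcpcb *tp, int thflags, uint8_t iptos)
{

	if (thflags & TH_CWR)			/* peer reduced its cwnd */
		tp->t_flags &= ~TF_ECN_SND_ECE;
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
		tp->t_flags |= TF_ECN_SND_ECE;	/* start echoing ECE */
}

/*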
*/ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session and vice versa. */ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); } } if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "no action\n", s, __func__); free(s, M_TCPLOG); } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. 
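 *
 * Editor's note: the header prediction conditions described above can be
 * summarized as one predicate.  A sketch (hypothetical helper, not part
 * of this commit) of the checks the __predict_true() chain below encodes:
 */

static inline int
tcp_header_prediction_ok(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, int thflags)
{

	return (tp->t_state == TCPS_ESTABLISHED &&	/* steady state */
	    (thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) ==
	    TH_ACK &&					/* a plain ACK */
	    th->th_seq == tp->rcv_nxt &&		/* in sequence */
	    LIST_EMPTY(&tp->t_segq) &&			/* no reassembly queue */
	    ((to->to_flags & TOF_TS) == 0 ||
	    TSTMP_GEQ(to->to_tsval, tp->ts_recent)));	/* timestamp fresh */
}

/*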
*/ if (__predict_true(tp->t_state == TCPS_ESTABLISHED) && __predict_true(((to.to_flags & TOF_SACK) == 0)) && __predict_true(tlen == 0) && __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) && __predict_true(LIST_EMPTY(&tp->t_segq)) && __predict_true(th->th_seq == tp->rcv_nxt)) { if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, ti_locked, tiwin)) { return; } } tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, ti_locked, tiwin, thflags); } struct tcp_function_block __tcp_fastslow = { .tfb_tcp_block_name = "fastslow", .tfb_tcp_output = tcp_output, .tfb_tcp_do_segment = tcp_do_segment_fastslow, .tfb_tcp_ctloutput = tcp_default_ctloutput, }; struct tcp_function_block __tcp_fastack = { .tfb_tcp_block_name = "fastack", .tfb_tcp_output = tcp_output, .tfb_tcp_do_segment = tcp_do_segment_fastack, .tfb_tcp_ctloutput = tcp_default_ctloutput }; static int tcp_addfastpaths(module_t mod, int type, void *data) { - int err=0; + int err = 0; switch (type) { case MOD_LOAD: err = register_tcp_functions(&__tcp_fastack, M_WAITOK); if (err) { printf("Failed to register fastack module -- err:%d\n", err); return(err); } err = register_tcp_functions(&__tcp_fastslow, M_WAITOK); if (err) { printf("Failed to register fastslow module -- err:%d\n", err); deregister_tcp_functions(&__tcp_fastack, false, true); return(err); } break; case MOD_QUIESCE: if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) { return(EBUSY); } err = deregister_tcp_functions(&__tcp_fastack, true, false); err = deregister_tcp_functions(&__tcp_fastslow, true, false); break; case MOD_UNLOAD: err = deregister_tcp_functions(&__tcp_fastack, false, true); err = deregister_tcp_functions(&__tcp_fastslow, false, true); if (err == EBUSY) break; err = 0; break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t new_tcp_fastpaths = { .name = "tcp_fastpaths", .evhand = tcp_addfastpaths, .priv = 0 }; MODULE_VERSION(kern_tcpfastpaths, 1); DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); Index: head/sys/netinet/tcp_stacks/rack.c =================================================================== --- head/sys/netinet/tcp_stacks/rack.c (nonexistent) +++ head/sys/netinet/tcp_stacks/rack.c (revision 334804) @@ -0,0 +1,9164 @@ +/*- + * Copyright (c) 2016-2018 + * Netflix Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" + +#include +#include +#include +#ifdef TCP_HHOOK +#include +#endif +#include +#include +#include +#include +#include +#include /* for proc0 declaration */ +#include +#include +#include +#include +#ifdef NETFLIX_STATS +#include +#endif +#include +#include +#include +#include +#include + +#include + +#include +#include + +#define TCPSTATES /* for logging */ + +#include +#include +#include +#include +#include /* required for icmp_var.h */ +#include /* for ICMP_BANDLIM */ +#include +#include +#include +#include +#define TCPOUTFLAGS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef NETFLIX_CWV +#include +#endif +#include +#ifdef TCPDEBUG +#include +#endif /* TCPDEBUG */ +#ifdef TCP_OFFLOAD +#include +#endif +#ifdef INET6 +#include +#endif + +#include + +#if defined(IPSEC) || defined(IPSEC_SUPPORT) +#include +#include +#endif /* IPSEC */ + +#include +#include +#include + +#ifdef MAC +#include +#endif +#include "sack_filter.h" +#include "tcp_rack.h" +#include "rack_bbr_common.h" + +uma_zone_t rack_zone; +uma_zone_t rack_pcb_zone; + +#ifndef TICKS2SBT +#define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t))) +#endif + +struct sysctl_ctx_list rack_sysctl_ctx; +struct sysctl_oid *rack_sysctl_root; + +#ifndef TCPHPTS +fatal error missing option TCPHSTS in the build; +#endif + +#define CUM_ACKED 1 +#define SACKED 2 + +/* + * The RACK module incorporates a number of + * TCP ideas that have been put out into the IETF + * over the last few years: + * - Matt Mathis's Rate Halving which slowly drops + * the congestion window so that the ack clock can + * be maintained during a recovery. + * - Yuchung Cheng's RACK TCP (for which its named) that + * will stop us using the number of dup acks and instead + * use time as the gage of when we retransmit. + * - Reorder Detection of RFC4737 and the Tail-Loss probe draft + * of Dukkipati et.al. + * RACK depends on SACK, so if an endpoint arrives that + * cannot do SACK the state machine below will shuttle the + * connection back to using the "default" TCP stack that is + * in FreeBSD. + * + * To implement RACK the original TCP stack was first decomposed + * into a functional state machine with individual states + * for each of the possible TCP connection states. The do_segement + * functions role in life is to mandate the connection supports SACK + * initially and then assure that the RACK state matches the conenction + * state before calling the states do_segment function. Each + * state is simplified due to the fact that the original do_segment + * has been decomposed and we *know* what state we are in (no + * switches on the state) and all tests for SACK are gone. This + * greatly simplifies what each state does. + * + * TCP output is also over-written with a new version since it + * must maintain the new rack scoreboard. 
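+ *
+ * Editor's note: as an illustration of the decomposition described above,
+ * the per-state handlers declared further down in this file
+ * (rack_do_syn_sent(), rack_do_established(), ...) can be thought of as
+ * entries of a state-indexed table.  The typedef and table below are a
+ * hypothetical sketch by the editor, not part of this file; the stack
+ * itself wires up the current handler as the connection changes state
+ * (see rack_set_state() below) rather than through such a table.
+ */
+
+typedef int (*rack_state_fn_t)(struct mbuf *m, struct tcphdr *th,
+    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
+    int32_t drop_hdrlen, int32_t tlen, int32_t *ti_locked, uint32_t tiwin,
+    int32_t thflags, int32_t nxt_pkt);
+
+static rack_state_fn_t rack_state_handler_sketch[TCP_NSTATES] = {
+	[TCPS_SYN_SENT]		= rack_do_syn_sent,
+	[TCPS_SYN_RECEIVED]	= rack_do_syn_recv,
+	[TCPS_ESTABLISHED]	= rack_do_established,
+	[TCPS_CLOSE_WAIT]	= rack_do_close_wait,
+	[TCPS_FIN_WAIT_1]	= rack_do_fin_wait_1,
+	[TCPS_CLOSING]		= rack_do_closing,
+	[TCPS_LAST_ACK]		= rack_do_lastack,
+	[TCPS_FIN_WAIT_2]	= rack_do_fin_wait_2,
+};
+
+/*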
+ * + */ +static int32_t rack_precache = 1; +static int32_t rack_tlp_thresh = 1; +static int32_t rack_reorder_thresh = 2; +static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 + * - 60 seconds */ +static int32_t rack_pkt_delay = 1; +static int32_t rack_inc_var = 0;/* For TLP */ +static int32_t rack_reduce_largest_on_idle = 0; +static int32_t rack_min_pace_time = 0; +static int32_t rack_min_pace_time_seg_req=6; +static int32_t rack_early_recovery = 1; +static int32_t rack_early_recovery_max_seg = 6; +static int32_t rack_send_a_lot_in_prr = 1; +static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ +static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */ +static int32_t rack_verbose_logging = 0; +static int32_t rack_ignore_data_after_close = 1; +/* + * Currently regular tcp has a rto_min of 30ms + * the backoff goes 12 times so that ends up + * being a total of 122.850 seconds before a + * connection is killed. + */ +static int32_t rack_tlp_min = 10; +static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ +static int32_t rack_rto_max = 30000; /* 30 seconds */ +static const int32_t rack_free_cache = 2; +static int32_t rack_hptsi_segments = 40; +static int32_t rack_rate_sample_method = USE_RTT_LOW; +static int32_t rack_pace_every_seg = 1; +static int32_t rack_delayed_ack_time = 200; /* 200ms */ +static int32_t rack_slot_reduction = 4; +static int32_t rack_lower_cwnd_at_tlp = 0; +static int32_t rack_use_proportional_reduce = 0; +static int32_t rack_proportional_rate = 10; +static int32_t rack_tlp_max_resend = 2; +static int32_t rack_limited_retran = 0; +static int32_t rack_always_send_oldest = 0; +static int32_t rack_sack_block_limit = 128; +static int32_t rack_use_sack_filter = 1; +static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; + +/* Rack specific counters */ +counter_u64_t rack_badfr; +counter_u64_t rack_badfr_bytes; +counter_u64_t rack_rtm_prr_retran; +counter_u64_t rack_rtm_prr_newdata; +counter_u64_t rack_timestamp_mismatch; +counter_u64_t rack_reorder_seen; +counter_u64_t rack_paced_segments; +counter_u64_t rack_unpaced_segments; +counter_u64_t rack_saw_enobuf; +counter_u64_t rack_saw_enetunreach; + +/* Tail loss probe counters */ +counter_u64_t rack_tlp_tot; +counter_u64_t rack_tlp_newdata; +counter_u64_t rack_tlp_retran; +counter_u64_t rack_tlp_retran_bytes; +counter_u64_t rack_tlp_retran_fail; +counter_u64_t rack_to_tot; +counter_u64_t rack_to_arm_rack; +counter_u64_t rack_to_arm_tlp; +counter_u64_t rack_to_alloc; +counter_u64_t rack_to_alloc_hard; +counter_u64_t rack_to_alloc_emerg; + +counter_u64_t rack_sack_proc_all; +counter_u64_t rack_sack_proc_short; +counter_u64_t rack_sack_proc_restart; +counter_u64_t rack_runt_sacks; +counter_u64_t rack_used_tlpmethod; +counter_u64_t rack_used_tlpmethod2; +counter_u64_t rack_enter_tlp_calc; +counter_u64_t rack_input_idle_reduces; +counter_u64_t rack_tlp_does_nada; + +/* Temp CPU counters */ +counter_u64_t rack_find_high; + +counter_u64_t rack_progress_drops; +counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; +counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; + +static void +rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); + +static int +rack_process_ack(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t * ti_locked, + uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); +static int +rack_process_data(struct mbuf *m, struct tcphdr *th, + struct socket 
*so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); +static void +rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, + struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); +static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); +static struct rack_sendmap * +rack_check_recovery_mode(struct tcpcb *tp, + uint32_t tsused); +static void +rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, + uint32_t type); +static void rack_counter_destroy(void); +static int +rack_ctloutput(struct socket *so, struct sockopt *sopt, + struct inpcb *inp, struct tcpcb *tp); +static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); +static void +rack_do_segment(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, + uint8_t iptos, int32_t ti_locked); +static void rack_dtor(void *mem, int32_t size, void *arg); +static void +rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, + uint32_t t, uint32_t cts); +static struct rack_sendmap * +rack_find_high_nonack(struct tcp_rack *rack, + struct rack_sendmap *rsm); +static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); +static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); +static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); +static int +rack_get_sockopt(struct socket *so, struct sockopt *sopt, + struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); +static int32_t rack_handoff_ok(struct tcpcb *tp); +static int32_t rack_init(struct tcpcb *tp); +static void rack_init_sysctls(void); +static void +rack_log_ack(struct tcpcb *tp, struct tcpopt *to, + struct tcphdr *th); +static void +rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, + uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, + uint8_t pass, struct rack_sendmap *hintrsm); +static void +rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, + struct rack_sendmap *rsm); +static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num); +static int32_t rack_output(struct tcpcb *tp); +static void +rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, + uint8_t iptos, int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv); + +static uint32_t +rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, + struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, + uint32_t cts); +static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); +static void rack_remxt_tmr(struct tcpcb *tp); +static int +rack_set_sockopt(struct socket *so, struct sockopt *sopt, + struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); +static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); +static int32_t rack_stopall(struct tcpcb *tp); +static void +rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, + uint32_t delta); +static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); +static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); +static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); +static uint32_t +rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, + struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp); +static void +rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, + struct rack_sendmap *rsm, uint32_t ts); +static int +rack_update_rtt(struct tcpcb 
*tp, struct tcp_rack *rack, + struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type); +static int32_t tcp_addrack(module_t mod, int32_t type, void *data); +static void +rack_challenge_ack(struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val); +static int +rack_do_close_wait(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, + int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); +static int +rack_do_closing(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, + int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); +static void rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked); +static void +rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val); +static void +rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen); +static int +rack_do_established(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, + int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); +static int +rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, + int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt); +static int +rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, + int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); +static int +rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, + int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); +static int +rack_do_lastack(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, + int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); +static int +rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, + int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); +static int +rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, + int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); +static int +rack_drop_checks(struct tcpopt *to, struct mbuf *m, + struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, + int32_t * drop_hdrlen, int32_t * ret_val); +static int +rack_process_rst(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp, int32_t * ti_locked); +struct rack_sendmap * +tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, + uint32_t tsused); +static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt); +static void + tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); + +static int +rack_ts_check(struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val); + +int32_t rack_clear_counter=0; + + +static int 
+sysctl_rack_clear(SYSCTL_HANDLER_ARGS) +{ + uint32_t stat; + int32_t error; + + error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); + if (error || req->newptr == NULL) + return error; + + error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); + if (error) + return (error); + if (stat == 1) { +#ifdef INVARIANTS + printf("Clearing RACK counters\n"); +#endif + counter_u64_zero(rack_badfr); + counter_u64_zero(rack_badfr_bytes); + counter_u64_zero(rack_rtm_prr_retran); + counter_u64_zero(rack_rtm_prr_newdata); + counter_u64_zero(rack_timestamp_mismatch); + counter_u64_zero(rack_reorder_seen); + counter_u64_zero(rack_tlp_tot); + counter_u64_zero(rack_tlp_newdata); + counter_u64_zero(rack_tlp_retran); + counter_u64_zero(rack_tlp_retran_bytes); + counter_u64_zero(rack_tlp_retran_fail); + counter_u64_zero(rack_to_tot); + counter_u64_zero(rack_to_arm_rack); + counter_u64_zero(rack_to_arm_tlp); + counter_u64_zero(rack_paced_segments); + counter_u64_zero(rack_unpaced_segments); + counter_u64_zero(rack_saw_enobuf); + counter_u64_zero(rack_saw_enetunreach); + counter_u64_zero(rack_to_alloc_hard); + counter_u64_zero(rack_to_alloc_emerg); + counter_u64_zero(rack_sack_proc_all); + counter_u64_zero(rack_sack_proc_short); + counter_u64_zero(rack_sack_proc_restart); + counter_u64_zero(rack_to_alloc); + counter_u64_zero(rack_find_high); + counter_u64_zero(rack_runt_sacks); + counter_u64_zero(rack_used_tlpmethod); + counter_u64_zero(rack_used_tlpmethod2); + counter_u64_zero(rack_enter_tlp_calc); + counter_u64_zero(rack_progress_drops); + counter_u64_zero(rack_tlp_does_nada); + } + rack_clear_counter = 0; + return (0); +} + + + +static void +rack_init_sysctls() +{ + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "rate_sample_method", CTLFLAG_RW, + &rack_rate_sample_method , USE_RTT_LOW, + "What method should we use for rate sampling 0=high, 1=low "); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "data_after_close", CTLFLAG_RW, + &rack_ignore_data_after_close, 0, + "Do we hold off sending a RST until all pending data is ack'd"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlpmethod", CTLFLAG_RW, + &rack_tlp_threshold_use, TLP_USE_TWO_ONE, + "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "min_pace_time", CTLFLAG_RW, + &rack_min_pace_time, 0, + "Should we enforce a minimum pace time of 1ms"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "min_pace_segs", CTLFLAG_RW, + &rack_min_pace_time_seg_req, 6, + "How many segments have to be in the len to enforce min-pace-time"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "idle_reduce_high", CTLFLAG_RW, + &rack_reduce_largest_on_idle, 0, + "Should we reduce the largest cwnd seen to IW on idle reduction"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "bb_verbose", CTLFLAG_RW, + &rack_verbose_logging, 0, + "Should RACK black box logging be verbose"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "sackfiltering", CTLFLAG_RW, + &rack_use_sack_filter, 1, + "Do we use sack filtering?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "delayed_ack", CTLFLAG_RW, + &rack_delayed_ack_time, 200, + "Delayed ack time (200ms)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + 
SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlpminto", CTLFLAG_RW, + &rack_tlp_min, 10, + "TLP minimum timeout per the specification (10ms)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "precache", CTLFLAG_RW, + &rack_precache, 0, + "Where should we precache the mcopy (0 is not at all)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "sblklimit", CTLFLAG_RW, + &rack_sack_block_limit, 128, + "When do we start paying attention to small sack blocks"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "send_oldest", CTLFLAG_RW, + &rack_always_send_oldest, 1, + "Should we always send the oldest TLP and RACK-TLP"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW, + &rack_tlp_in_recovery, 1, + "Can we do a TLP during recovery?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "rack_tlimit", CTLFLAG_RW, + &rack_limited_retran, 0, + "How many times can a rack timeout drive out sends"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "minrto", CTLFLAG_RW, + &rack_rto_min, 0, + "Minimum RTO in ms -- set with caution below 1000 due to TLP"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "maxrto", CTLFLAG_RW, + &rack_rto_max, 0, + "Maxiumum RTO in ms -- should be at least as large as min_rto"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlp_retry", CTLFLAG_RW, + &rack_tlp_max_resend, 2, + "How many times does TLP retry a single segment or multiple with no ACK"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, + &rack_use_proportional_reduce, 0, + "Should we proportionaly reduce cwnd based on the number of losses "); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "recovery_prop", CTLFLAG_RW, + &rack_proportional_rate, 10, + "What percent reduction per loss"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, + &rack_lower_cwnd_at_tlp, 0, + "When a TLP completes a retran should we enter recovery?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "hptsi_reduces", CTLFLAG_RW, + &rack_slot_reduction, 4, + "When setting a slot should we reduce by divisor"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "hptsi_every_seg", CTLFLAG_RW, + &rack_pace_every_seg, 1, + "Should we pace out every segment hptsi"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "hptsi_seg_max", CTLFLAG_RW, + &rack_hptsi_segments, 6, + "Should we pace out only a limited size of segments"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "prr_sendalot", CTLFLAG_RW, + &rack_send_a_lot_in_prr, 1, + "Send a lot in prr"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "minto", CTLFLAG_RW, + &rack_min_to, 1, + "Minimum rack timeout in milliseconds"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW, + &rack_early_recovery_max_seg, 6, + "Max segments in early recovery"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "earlyrecovery", CTLFLAG_RW, + 
&rack_early_recovery, 1, + "Do we do early recovery with rack"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "reorder_thresh", CTLFLAG_RW, + &rack_reorder_thresh, 2, + "What factor for rack will be added when seeing reordering (shift right)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, + &rack_tlp_thresh, 1, + "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "reorder_fade", CTLFLAG_RW, + &rack_reorder_fade, 0, + "Does reorder detection fade, if so how many ms (0 means never)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "pktdelay", CTLFLAG_RW, + &rack_pkt_delay, 1, + "Extra RACK time (in ms) besides reordering thresh"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "inc_var", CTLFLAG_RW, + &rack_inc_var, 0, + "Should rack add to the TLP timer the variance in rtt calculation"); + rack_badfr = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "badfr", CTLFLAG_RD, + &rack_badfr, "Total number of bad FRs"); + rack_badfr_bytes = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "badfr_bytes", CTLFLAG_RD, + &rack_badfr_bytes, "Total number of bad FRs"); + rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "prrsndret", CTLFLAG_RD, + &rack_rtm_prr_retran, + "Total number of prr based retransmits"); + rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "prrsndnew", CTLFLAG_RD, + &rack_rtm_prr_newdata, + "Total number of prr based new transmits"); + rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tsnf", CTLFLAG_RD, + &rack_timestamp_mismatch, + "Total number of timestamps that we could not find the reported ts"); + rack_find_high = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "findhigh", CTLFLAG_RD, + &rack_find_high, + "Total number of FIN causing find-high"); + rack_reorder_seen = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "reordering", CTLFLAG_RD, + &rack_reorder_seen, + "Total number of times we added delay due to reordering"); + rack_tlp_tot = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlp_to_total", CTLFLAG_RD, + &rack_tlp_tot, + "Total number of tail loss probe expirations"); + rack_tlp_newdata = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlp_new", CTLFLAG_RD, + &rack_tlp_newdata, + "Total number of tail loss probe sending new data"); + + rack_tlp_retran = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlp_retran", CTLFLAG_RD, + &rack_tlp_retran, + "Total number of tail loss probe sending retransmitted data"); + rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); + 
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, + &rack_tlp_retran_bytes, + "Total bytes of tail loss probe sending retransmitted data"); + rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, + &rack_tlp_retran_fail, + "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); + rack_to_tot = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "rack_to_tot", CTLFLAG_RD, + &rack_to_tot, + "Total number of times the rack to expired?"); + rack_to_arm_rack = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "arm_rack", CTLFLAG_RD, + &rack_to_arm_rack, + "Total number of times the rack timer armed?"); + rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "arm_tlp", CTLFLAG_RD, + &rack_to_arm_tlp, + "Total number of times the tlp timer armed?"); + rack_paced_segments = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "paced", CTLFLAG_RD, + &rack_paced_segments, + "Total number of times a segment send caused hptsi"); + rack_unpaced_segments = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "unpaced", CTLFLAG_RD, + &rack_unpaced_segments, + "Total number of times a segment did not cause hptsi"); + rack_saw_enobuf = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "saw_enobufs", CTLFLAG_RD, + &rack_saw_enobuf, + "Total number of times a segment did not cause hptsi"); + rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "saw_enetunreach", CTLFLAG_RD, + &rack_saw_enetunreach, + "Total number of times a segment did not cause hptsi"); + rack_to_alloc = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "allocs", CTLFLAG_RD, + &rack_to_alloc, + "Total allocations of tracking structures"); + rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "allochard", CTLFLAG_RD, + &rack_to_alloc_hard, + "Total allocations done with sleeping the hard way"); + rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "allocemerg", CTLFLAG_RD, + &rack_to_alloc_emerg, + "Total alocations done from emergency cache"); + rack_sack_proc_all = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "sack_long", CTLFLAG_RD, + &rack_sack_proc_all, + "Total times we had to walk whole list for sack processing"); + + rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "sack_restart", CTLFLAG_RD, + &rack_sack_proc_restart, + "Total times we had to walk whole list due to a restart"); + rack_sack_proc_short = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + 
SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "sack_short", CTLFLAG_RD, + &rack_sack_proc_short, + "Total times we took shortcut for sack processing"); + rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, + &rack_enter_tlp_calc, + "Total times we called calc-tlp"); + rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "hit_tlp_method", CTLFLAG_RD, + &rack_used_tlpmethod, + "Total number of runt sacks"); + rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, + &rack_used_tlpmethod2, + "Total number of runt sacks 2"); + rack_runt_sacks = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "runtsacks", CTLFLAG_RD, + &rack_runt_sacks, + "Total number of runt sacks"); + rack_progress_drops = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "prog_drops", CTLFLAG_RD, + &rack_progress_drops, + "Total number of progress drops"); + rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, + &rack_input_idle_reduces, + "Total number of idle reductions on input"); + rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "tlp_nada", CTLFLAG_RD, + &rack_tlp_does_nada, + "Total number of nada tlp calls"); + COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); + SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "outsize", CTLFLAG_RD, + rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); + COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); + SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "opts", CTLFLAG_RD, + rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); + SYSCTL_ADD_PROC(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); +} + +static inline int32_t +rack_progress_timeout_check(struct tcpcb *tp) +{ + if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { + if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { + /* + * There is an assumption that the caller + * will drop the connection so we will + * increment the counters here. 
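+ *
+ * Editor's note: the surrounding check is a wrap-safe "no forward
+ * progress for too long" test.  A minimal stand-alone sketch of the same
+ * arithmetic (hypothetical helper, not part of this file):
+ */
+
+static inline int
+rack_progress_expired(uint32_t now, uint32_t t_acktime,
+    uint32_t t_maxunacktime)
+{
+
+	/* Both the deadline and the last-progress stamp must be armed. */
+	if (t_maxunacktime == 0 || t_acktime == 0)
+		return (0);
+	/* Unsigned subtraction keeps the elapsed time valid across wrap. */
+	return (TSTMP_GT(now, t_acktime) &&
+	    (now - t_acktime) >= t_maxunacktime);
+}
+
+/*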
+ */ + struct tcp_rack *rack; + rack = (struct tcp_rack *)tp->t_fb_ptr; + counter_u64_add(rack_progress_drops, 1); +#ifdef NETFLIX_STATS + TCPSTAT_INC(tcps_progdrops); +#endif + rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); + return (1); + } + } + return (0); +} + + +static void +rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); + log.u_bbr.flex2 = to; + log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.flex4 = slot; + log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; + log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; + log.u_bbr.flex8 = which; + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + TCP_LOG_EVENT(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_TIMERSTAR, 0, + 0, &log, false); + } +} + +static void +rack_log_to_event(struct tcp_rack *rack, int32_t to_num) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex8 = to_num; + log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; + log.u_bbr.flex2 = rack->rc_rack_rtt; + TCP_LOG_EVENT(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_RTO, 0, + 0, &log, false); + } +} + +static void +rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t, + uint32_t o_srtt, uint32_t o_var) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex1 = t; + log.u_bbr.flex2 = o_srtt; + log.u_bbr.flex3 = o_var; + log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; + log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; + log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; + log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot; + log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; + TCP_LOG_EVENT(tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_BBRRTT, 0, + 0, &log, false); + } +} + +static void +rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) +{ + /* + * Log the rtt sample we are + * applying to the srtt algorithm in + * useconds. 
+ */ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + /* Convert our ms to a microsecond */ + log.u_bbr.flex1 = rtt * 1000; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_LOG_RTT, 0, + 0, &log, false, &tv); + } +} + + +static inline void +rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) +{ + if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = tick; + log.u_bbr.flex3 = tp->t_maxunacktime; + log.u_bbr.flex4 = tp->t_acktime; + log.u_bbr.flex8 = event; + TCP_LOG_EVENT(tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_PROGRESS, 0, + 0, &log, false); + } +} + +static void +rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex1 = slot; + log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); + log.u_bbr.flex8 = rack->rc_in_persist; + TCP_LOG_EVENT(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_BBRSND, 0, + 0, &log, false); + } +} + +static void +rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + log.u_bbr.flex1 = did_out; + log.u_bbr.flex2 = nxt_pkt; + log.u_bbr.flex3 = way_out; + log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.flex7 = rack->r_wanted_output; + log.u_bbr.flex8 = rack->rc_in_persist; + TCP_LOG_EVENT(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_DOSEG_DONE, 0, + 0, &log, false); + } +} + + +static void +rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex1 = slot; + log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.flex7 = hpts_calling; + log.u_bbr.flex8 = rack->rc_in_persist; + TCP_LOG_EVENT(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_JUSTRET, 0, + tlen, &log, false); + } +} + +static void +rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = 0; + log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.flex4 = 0; + log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; + log.u_bbr.flex8 = hpts_removed; + TCP_LOG_EVENT(rack->rc_tp, NULL, + 
&rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_TIMERCANC, 0, + 0, &log, false); + } +} + +static void +rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = timers; + log.u_bbr.flex2 = ret; + log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; + log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.flex5 = cts; + TCP_LOG_EVENT(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_TO_PROCESS, 0, + 0, &log, false); + } +} + +static void +rack_counter_destroy() +{ + counter_u64_free(rack_badfr); + counter_u64_free(rack_badfr_bytes); + counter_u64_free(rack_rtm_prr_retran); + counter_u64_free(rack_rtm_prr_newdata); + counter_u64_free(rack_timestamp_mismatch); + counter_u64_free(rack_reorder_seen); + counter_u64_free(rack_tlp_tot); + counter_u64_free(rack_tlp_newdata); + counter_u64_free(rack_tlp_retran); + counter_u64_free(rack_tlp_retran_bytes); + counter_u64_free(rack_tlp_retran_fail); + counter_u64_free(rack_to_tot); + counter_u64_free(rack_to_arm_rack); + counter_u64_free(rack_to_arm_tlp); + counter_u64_free(rack_paced_segments); + counter_u64_free(rack_unpaced_segments); + counter_u64_free(rack_saw_enobuf); + counter_u64_free(rack_saw_enetunreach); + counter_u64_free(rack_to_alloc_hard); + counter_u64_free(rack_to_alloc_emerg); + counter_u64_free(rack_sack_proc_all); + counter_u64_free(rack_sack_proc_short); + counter_u64_free(rack_sack_proc_restart); + counter_u64_free(rack_to_alloc); + counter_u64_free(rack_find_high); + counter_u64_free(rack_runt_sacks); + counter_u64_free(rack_enter_tlp_calc); + counter_u64_free(rack_used_tlpmethod); + counter_u64_free(rack_used_tlpmethod2); + counter_u64_free(rack_progress_drops); + counter_u64_free(rack_input_idle_reduces); + counter_u64_free(rack_tlp_does_nada); + COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); + COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); +} + +static struct rack_sendmap * +rack_alloc(struct tcp_rack *rack) +{ + struct rack_sendmap *rsm; + + counter_u64_add(rack_to_alloc, 1); + rack->r_ctl.rc_num_maps_alloced++; + rsm = uma_zalloc(rack_zone, M_NOWAIT); + if (rsm) { + return (rsm); + } + if (rack->rc_free_cnt) { + counter_u64_add(rack_to_alloc_emerg, 1); + rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); + TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); + rack->rc_free_cnt--; + return (rsm); + } + return (NULL); +} + +static void +rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) +{ + rack->r_ctl.rc_num_maps_alloced--; + if (rack->r_ctl.rc_tlpsend == rsm) + rack->r_ctl.rc_tlpsend = NULL; + if (rack->r_ctl.rc_next == rsm) + rack->r_ctl.rc_next = NULL; + if (rack->r_ctl.rc_sacklast == rsm) + rack->r_ctl.rc_sacklast = NULL; + if (rack->rc_free_cnt < rack_free_cache) { + memset(rsm, 0, sizeof(struct rack_sendmap)); + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); + rack->rc_free_cnt++; + return; + } + uma_zfree(rack_zone, rsm); +} + +/* + * CC wrapper hook functions + */ +static void +rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, + uint16_t type, int32_t recovery) +{ +#ifdef NETFLIX_STATS + int32_t gput; +#endif +#ifdef NETFLIX_CWV + u_long old_cwnd = tp->snd_cwnd; +#endif + + INP_WLOCK_ASSERT(tp->t_inpcb); + tp->ccv->nsegs = nsegs; + tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); + if 
((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { + uint32_t max; + + max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg; + if (tp->ccv->bytes_this_ack > max) { + tp->ccv->bytes_this_ack = max; + } + } + if (tp->snd_cwnd <= tp->snd_wnd) + tp->ccv->flags |= CCF_CWND_LIMITED; + else + tp->ccv->flags &= ~CCF_CWND_LIMITED; + + if (type == CC_ACK) { +#ifdef NETFLIX_STATS + stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, + ((int32_t) tp->snd_cwnd) - tp->snd_wnd); + if ((tp->t_flags & TF_GPUTINPROG) && + SEQ_GEQ(th->th_ack, tp->gput_ack)) { + gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / + max(1, tcp_ts_getticks() - tp->gput_ts); + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, + gput); + /* + * XXXLAS: This is a temporary hack, and should be + * chained off VOI_TCP_GPUT when stats(9) grows an + * API to deal with chained VOIs. + */ + if (tp->t_stats_gput_prev > 0) + stats_voi_update_abs_s32(tp->t_stats, + VOI_TCP_GPUT_ND, + ((gput - tp->t_stats_gput_prev) * 100) / + tp->t_stats_gput_prev); + tp->t_flags &= ~TF_GPUTINPROG; + tp->t_stats_gput_prev = gput; + + if (tp->t_maxpeakrate) { + /* + * We update t_peakrate_thr. This gives us roughly + * one update per round trip time. + */ + tcp_update_peakrate_thr(tp); + } + } +#endif + if (tp->snd_cwnd > tp->snd_ssthresh) { + tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, + nsegs * V_tcp_abc_l_var * tp->t_maxseg); + if (tp->t_bytes_acked >= tp->snd_cwnd) { + tp->t_bytes_acked -= tp->snd_cwnd; + tp->ccv->flags |= CCF_ABC_SENTAWND; + } + } else { + tp->ccv->flags &= ~CCF_ABC_SENTAWND; + tp->t_bytes_acked = 0; + } + } + if (CC_ALGO(tp)->ack_received != NULL) { + /* XXXLAS: Find a way to live without this */ + tp->ccv->curack = th->th_ack; + CC_ALGO(tp)->ack_received(tp->ccv, type); + } +#ifdef NETFLIX_STATS + stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); +#endif + if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { + rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; + } +#ifdef NETFLIX_CWV + if (tp->cwv_enabled) { + /* + * Per RFC 7661: The behaviour in the non-validated phase is + * specified as: o A sender determines whether to increase + * the cwnd based upon whether it is cwnd-limited (see + * Section 4.5.3): * A sender that is cwnd-limited MAY use + * the standard TCP method to increase cwnd (i.e., the + * standard method permits a TCP sender that fully utilises + * the cwnd to increase the cwnd each time it receives an + * ACK). * A sender that is not cwnd-limited MUST NOT + * increase the cwnd when ACK packets are received in this + * phase (i.e., needs to avoid growing the cwnd when it has + * not recently sent using the current size of cwnd). + */ + if ((tp->snd_cwnd > old_cwnd) && + (tp->cwv_cwnd_valid == 0) && + (!(tp->ccv->flags & CCF_CWND_LIMITED))) { + tp->snd_cwnd = old_cwnd; + } + /* Try to update pipeAck and NCWV state */ + if (TCPS_HAVEESTABLISHED(tp->t_state) && + !IN_RECOVERY(tp->t_flags)) { + uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd)); + + tcp_newcwv_update_pipeack(tp, data); + } + } +#endif + /* we enforce max peak rate if it is set. 
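+ *
+ * Editor's note: the byte-counting step earlier in rack_ack_received()
+ * follows RFC 3465 (Appropriate Byte Counting).  A stand-alone sketch of
+ * that accounting (hypothetical helper, not part of this file):
+ */
+
+static inline void
+rack_abc_account(struct tcpcb *tp, uint32_t bytes_this_ack, uint16_t nsegs)
+{
+
+	if (tp->snd_cwnd > tp->snd_ssthresh) {
+		/*
+		 * Congestion avoidance: credit at most L (V_tcp_abc_l_var)
+		 * segments worth of bytes per ACK; once a full cwnd of
+		 * bytes has been acknowledged, flag CCF_ABC_SENTAWND so
+		 * the CC module may grow the window.
+		 */
+		tp->t_bytes_acked += min(bytes_this_ack,
+		    nsegs * V_tcp_abc_l_var * tp->t_maxseg);
+		if (tp->t_bytes_acked >= tp->snd_cwnd) {
+			tp->t_bytes_acked -= tp->snd_cwnd;
+			tp->ccv->flags |= CCF_ABC_SENTAWND;
+		}
+	} else {
+		/* Slow start: no byte accumulation is needed. */
+		tp->ccv->flags &= ~CCF_ABC_SENTAWND;
+		tp->t_bytes_acked = 0;
+	}
+}
+
+/*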
*/ + if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { + tp->snd_cwnd = tp->t_peakrate_thr; + } +} + +static void +tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) +{ + struct tcp_rack *rack; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + INP_WLOCK_ASSERT(tp->t_inpcb); + if (rack->r_ctl.rc_prr_sndcnt > 0) + rack->r_wanted_output++; +} + +static void +rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) +{ + struct tcp_rack *rack; + + INP_WLOCK_ASSERT(tp->t_inpcb); + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (CC_ALGO(tp)->post_recovery != NULL) { + tp->ccv->curack = th->th_ack; + CC_ALGO(tp)->post_recovery(tp->ccv); + } + /* + * Here we can in theory adjust cwnd to be based on the number of + * losses in the window (rack->r_ctl.rc_loss_count). This is done + * based on the rack_use_proportional flag. + */ + if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) { + int32_t reduce; + + reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate); + if (reduce > 50) { + reduce = 50; + } + tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100); + } else { + if (tp->snd_cwnd > tp->snd_ssthresh) { + /* Drop us down to the ssthresh (1/2 cwnd at loss) */ + tp->snd_cwnd = tp->snd_ssthresh; + } + } + if (rack->r_ctl.rc_prr_sndcnt > 0) { + /* Suck the next prr cnt back into cwnd */ + tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; + rack->r_ctl.rc_prr_sndcnt = 0; + } + EXIT_RECOVERY(tp->t_flags); + + +#ifdef NETFLIX_CWV + if (tp->cwv_enabled) { + if ((tp->cwv_cwnd_valid == 0) && + (tp->snd_cwv.in_recovery)) + tcp_newcwv_end_recovery(tp); + } +#endif +} + +static void +rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) +{ + struct tcp_rack *rack; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + rack = (struct tcp_rack *)tp->t_fb_ptr; + switch (type) { + case CC_NDUPACK: +/* rack->r_ctl.rc_ssthresh_set = 1;*/ + if (!IN_FASTRECOVERY(tp->t_flags)) { + rack->r_ctl.rc_tlp_rtx_out = 0; + rack->r_ctl.rc_prr_delivered = 0; + rack->r_ctl.rc_prr_out = 0; + rack->r_ctl.rc_loss_count = 0; + rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; + tp->snd_recover = tp->snd_max; + if (tp->t_flags & TF_ECN_PERMIT) + tp->t_flags |= TF_ECN_SND_CWR; + } + break; + case CC_ECN: + if (!IN_CONGRECOVERY(tp->t_flags)) { + TCPSTAT_INC(tcps_ecn_rcwnd); + tp->snd_recover = tp->snd_max; + if (tp->t_flags & TF_ECN_PERMIT) + tp->t_flags |= TF_ECN_SND_CWR; + } + break; + case CC_RTO: + tp->t_dupacks = 0; + tp->t_bytes_acked = 0; + EXIT_RECOVERY(tp->t_flags); + tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg) * tp->t_maxseg; + tp->snd_cwnd = tp->t_maxseg; + break; + case CC_RTO_ERR: + TCPSTAT_INC(tcps_sndrexmitbad); + /* RTO was unnecessary, so reset everything. 
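+ *
+ * Editor's note: rack_post_recovery() above can reduce cwnd in proportion
+ * to the number of losses seen in the window instead of the usual cut to
+ * ssthresh.  A pure-function sketch of that computation (hypothetical
+ * helper, not part of this file); with the default 10% rate, three losses
+ * would leave 70% of the prior cwnd:
+ */
+
+static inline uint32_t
+rack_prop_reduced_cwnd(uint32_t cwnd, uint32_t loss_count, uint32_t prop_rate)
+{
+	uint32_t reduce;
+
+	/* Each loss shaves prop_rate percent off cwnd, capped at 50%. */
+	reduce = loss_count * prop_rate;
+	if (reduce > 50)
+		reduce = 50;
+	return (cwnd - (uint32_t)(((uint64_t)reduce * cwnd) / 100));
+}
+
+/*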
*/ + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_recover = tp->snd_recover_prev; + if (tp->t_flags & TF_WASFRECOVERY) + ENTER_FASTRECOVERY(tp->t_flags); + if (tp->t_flags & TF_WASCRECOVERY) + ENTER_CONGRECOVERY(tp->t_flags); + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; + break; + } + + if (CC_ALGO(tp)->cong_signal != NULL) { + if (th != NULL) + tp->ccv->curack = th->th_ack; + CC_ALGO(tp)->cong_signal(tp->ccv, type); + } +#ifdef NETFLIX_CWV + if (tp->cwv_enabled) { + if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) { + tcp_newcwv_enter_recovery(tp); + } + if (type == CC_RTO) { + tcp_newcwv_reset(tp); + } + } +#endif +} + + + +static inline void +rack_cc_after_idle(struct tcpcb *tp, int reduce_largest) +{ + uint32_t i_cwnd; + + INP_WLOCK_ASSERT(tp->t_inpcb); + +#ifdef NETFLIX_STATS + TCPSTAT_INC(tcps_idle_restarts); + if (tp->t_state == TCPS_ESTABLISHED) + TCPSTAT_INC(tcps_idle_estrestarts); +#endif + if (CC_ALGO(tp)->after_idle != NULL) + CC_ALGO(tp)->after_idle(tp->ccv); + + if (tp->snd_cwnd == 1) + i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ + else if (V_tcp_initcwnd_segments) + i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), + max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460)); + else if (V_tcp_do_rfc3390) + i_cwnd = min(4 * tp->t_maxseg, + max(2 * tp->t_maxseg, 4380)); + else { + /* Per RFC5681 Section 3.1 */ + if (tp->t_maxseg > 2190) + i_cwnd = 2 * tp->t_maxseg; + else if (tp->t_maxseg > 1095) + i_cwnd = 3 * tp->t_maxseg; + else + i_cwnd = 4 * tp->t_maxseg; + } + if (reduce_largest) { + /* + * Do we reduce the largest cwnd to make + * rack play nice on restart hptsi wise? + */ + if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd) + ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd; + } + /* + * Being idle is no differnt than the initial window. If the cc + * clamps it down below the initial window raise it to the initial + * window. + */ + if (tp->snd_cwnd < i_cwnd) { + tp->snd_cwnd = i_cwnd; + } +} + + +/* + * Indicate whether this ack should be delayed. We can delay the ack if + * following conditions are met: + * - There is no delayed ack timer in progress. + * - Our last ack wasn't a 0-sized window. We never want to delay + * the ack that opens up a 0-sized window. + * - LRO wasn't used for this segment. We make sure by checking that the + * segment size is not larger than the MSS. + * - Delayed acks are enabled or this is a half-synchronized T/TCP + * connection. + */ +#define DELAY_ACK(tp, tlen) \ + (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ + ((tp->t_flags & TF_DELACK) == 0) && \ + (tlen <= tp->t_maxseg) && \ + (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) + +static inline void +rack_calc_rwin(struct socket *so, struct tcpcb *tp) +{ + int32_t win; + + /* + * Calculate amount of space in receive window, and then do TCP + * input processing. Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); +} + +static void +rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked) +{ + if (*ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + *ti_locked = TI_UNLOCKED; + } + /* + * Drop space held by incoming segment and return. 
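/*
 * Sketch of the restart-window selection done in rack_cc_after_idle()
 * above when neither the initcwnd nor the RFC 3390 knobs apply: the
 * RFC 5681 section 3.1 initial window as a function of MSS.  Purely
 * illustrative; the tunables in the real code are sysctls.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
rfc5681_iw(uint32_t maxseg)
{
	if (maxseg > 2190)
		return (2 * maxseg);	/* large MSS: 2 segments */
	else if (maxseg > 1095)
		return (3 * maxseg);	/* typical Ethernet MSS: 3 */
	else
		return (4 * maxseg);	/* small MSS: 4 segments */
}

int
main(void)
{
	printf("%u %u %u\n", rfc5681_iw(8960), rfc5681_iw(1460),
	    rfc5681_iw(536));		/* 17920 4380 2144 */
	return (0);
}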
+ */ + if (tp != NULL) + INP_WUNLOCK(tp->t_inpcb); + if (m) + m_freem(m); +} + +static void +rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen) +{ + if (*ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + *ti_locked = TI_UNLOCKED; + } + if (tp != NULL) { + tcp_dropwithreset(m, th, tp, tlen, rstreason); + INP_WUNLOCK(tp->t_inpcb); + } else + tcp_dropwithreset(m, th, NULL, tlen, rstreason); +} + +/* + * The value in ret_val informs the caller + * if we dropped the tcb (and lock) or not. + * 1 = we dropped it, 0 = the TCB is still locked + * and valid. + */ +static void +rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val) +{ + /* + * Generate an ACK dropping incoming segment if it occupies sequence + * space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all paths to this + * code happen after packets containing RST have been dropped. + * + * In the SYN-RECEIVED state, don't send an ACK unless the segment + * we received passes the SYN-RECEIVED ACK test. If it fails send a + * RST. This breaks the loop in the "LAND" DoS attack, and also + * prevents an ACK storm between two listening ports that have been + * sent forged SYN segments, each with the source address of the + * other. + */ + struct tcp_rack *rack; + + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max))) { + *ret_val = 1; + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return; + } else + *ret_val = 0; + if (*ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + *ti_locked = TI_UNLOCKED; + } + rack = (struct tcp_rack *)tp->t_fb_ptr; + rack->r_wanted_output++; + tp->t_flags |= TF_ACKNOW; + if (m) + m_freem(m); +} + + +static int +rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t * ti_locked) +{ + /* + * RFC5961 Section 3.2 + * + * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in + * window, we send challenge ACK. + * + * Note: to take into account delayed ACKs, we should test against + * last_ack_sent instead of rcv_nxt. Note 2: we handle special case + * of closed window, not covered by the RFC. + */ + int dropped = 0; + + if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || + (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(*ti_locked == TI_RLOCKED, + ("%s: TH_RST ti_locked %d, th %p tp %p", + __func__, *ti_locked, th, tp)); + KASSERT(tp->t_state != TCPS_SYN_SENT, + ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", + __func__, th, tp)); + + if (V_tcp_insecure_rst || + (tp->last_ack_sent == th->th_seq) || + (tp->rcv_nxt == th->th_seq) || + ((tp->last_ack_sent - 1) == th->th_seq)) { + TCPSTAT_INC(tcps_drops); + /* Drop the connection. */ + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + so->so_error = ECONNRESET; + close: + tcp_state_change(tp, TCPS_CLOSED); + /* FALLTHROUGH */ + default: + tp = tcp_close(tp); + } + dropped = 1; + rack_do_drop(m, tp, ti_locked); + } else { + TCPSTAT_INC(tcps_badrst); + /* Send challenge ACK. 
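/*
 * Sketch of the RFC 5961 section 3.2 test used in rack_process_rst()
 * above: a RST is only acted on when its sequence number falls inside
 * the window we last advertised (or hits the zero-window special
 * case); an in-window but inexact RST draws a challenge ACK instead.
 * Hypothetical, simplified types and no locking.
 */
#include <stdint.h>

#define	SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define	SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

/* 1 = drop connection, 0 = send challenge ACK, -1 = ignore segment. */
static int
rst_classify(uint32_t seq, uint32_t last_ack_sent, uint32_t rcv_nxt,
    uint32_t rcv_wnd, int insecure_rst)
{
	int in_window;

	in_window = (SEQ_GEQ(seq, last_ack_sent - 1) &&
	    SEQ_LT(seq, last_ack_sent + rcv_wnd)) ||
	    (rcv_wnd == 0 && last_ack_sent == seq);
	if (!in_window)
		return (-1);
	if (insecure_rst || seq == last_ack_sent || seq == rcv_nxt ||
	    seq == last_ack_sent - 1)
		return (1);
	return (0);
}

int
main(void)
{
	/* Exact match on rcv_nxt: connection is dropped (returns 1). */
	return (rst_classify(1000, 1000, 1000, 65535, 0) == 1 ? 0 : 1);
}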
*/ + tcp_respond(tp, mtod(m, void *), th, m, + tp->rcv_nxt, tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + } + } else { + m_freem(m); + } + return (dropped); +} + +/* + * The value in ret_val informs the caller + * if we dropped the tcb (and lock) or not. + * 1 = we dropped it, 0 = the TCB is still locked + * and valid. + */ +static void +rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val) +{ + KASSERT(*ti_locked == TI_RLOCKED, + ("tcp_do_segment: TH_SYN ti_locked %d", *ti_locked)); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + + TCPSTAT_INC(tcps_badsyn); + if (V_tcp_insecure_syn && + SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + tp = tcp_drop(tp, ECONNRESET); + *ret_val = 1; + rack_do_drop(m, tp, ti_locked); + } else { + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, + tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + m = NULL; + *ret_val = 0; + rack_do_drop(m, NULL, ti_locked); + } +} + +/* + * rack_ts_check returns 1 for you should not proceed. It places + * in ret_val what should be returned 1/0 by the caller. The 1 indicates + * that the TCB is unlocked and probably dropped. The 0 indicates the + * TCB is still valid and locked. + */ +static int +rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val) +{ + + /* Check to see if ts_recent is over 24 days old. */ + if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates ts_recent, + * the age will be reset later and ts_recent will get a + * valid value. If it does not, setting ts_recent to zero + * will at least satisfy the requirement that zero be placed + * in the timestamp echo reply when ts_recent isn't valid. + * The age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be dropped + * when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, tlen); + TCPSTAT_INC(tcps_pawsdrop); + *ret_val = 0; + if (tlen) { + rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); + } else { + rack_do_drop(m, NULL, ti_locked); + } + return (1); + } + return (0); +} + +/* + * rack_drop_checks returns 1 for you should not proceed. It places + * in ret_val what should be returned 1/0 by the caller. The 1 indicates + * that the TCB is unlocked and probably dropped. The 0 indicates the + * TCB is still valid and locked. + */ +static int +rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) +{ + int32_t todrop; + int32_t thflags; + int32_t tlen; + + thflags = *thf; + tlen = *tlenp; + todrop = tp->rcv_nxt - th->th_seq; + if (todrop > 0) { + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; + else + thflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + thflags &= ~TH_FIN; + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. 
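/*
 * Sketch of the segment trimming arithmetic performed by
 * rack_drop_checks(): bytes the receiver already holds (before
 * rcv_nxt) are dropped from the front, and bytes beyond the
 * advertised window are dropped from the tail.  Simplified: no
 * SYN/FIN/URG handling and no sequence-number wraparound.
 */
#include <stdint.h>
#include <stdio.h>

static void
trim_segment(uint32_t seq, uint32_t tlen, uint32_t rcv_nxt,
    uint32_t rcv_wnd, uint32_t *head_drop, uint32_t *tail_drop)
{
	int64_t d;

	d = (int64_t)rcv_nxt - seq;		/* duplicate prefix */
	*head_drop = d > 0 ? (d > tlen ? tlen : (uint32_t)d) : 0;
	d = (int64_t)(seq + tlen) - (rcv_nxt + rcv_wnd);
	*tail_drop = d > 0 ? (uint32_t)d : 0;	/* past the window */
}

int
main(void)
{
	uint32_t head, tail;

	/* 1000-byte segment at seq 90, rcv_nxt 100, 500-byte window. */
	trim_segment(90, 1000, 100, 500, &head, &tail);
	printf("head=%u tail=%u\n", head, tail);	/* 10 490 */
	return (0);
}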
+ */ + tp->t_flags |= TF_ACKNOW; + todrop = tlen; + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, todrop); + } else { + TCPSTAT_INC(tcps_rcvpartduppack); + TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); + } + *drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; + else { + thflags &= ~TH_URG; + th->th_urp = 0; + } + } + /* + * If segment ends after window, drop trailing data (and PUSH and + * FIN); if nothing left, just ACK. + */ + todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); + if (todrop > 0) { + TCPSTAT_INC(tcps_rcvpackafterwin); + if (todrop >= tlen) { + TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment and + * ack. + */ + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + TCPSTAT_INC(tcps_rcvwinprobe); + } else { + rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); + return (1); + } + } else + TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); + m_adj(m, -todrop); + tlen -= todrop; + thflags &= ~(TH_PUSH | TH_FIN); + } + *thf = thflags; + *tlenp = tlen; + return (0); +} + +static struct rack_sendmap * +rack_find_lowest_rsm(struct tcp_rack *rack) +{ + struct rack_sendmap *rsm; + + /* + * Walk the time-order transmitted list looking for an rsm that is + * not acked. This will be the one that was sent the longest time + * ago that is still outstanding. + */ + TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { + if (rsm->r_flags & RACK_ACKED) { + continue; + } + goto finish; + } +finish: + return (rsm); +} + +static struct rack_sendmap * +rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) +{ + struct rack_sendmap *prsm; + + /* + * Walk the sequence order list backward until we hit and arrive at + * the highest seq not acked. In theory when this is called it + * should be the last segment (which it was not). + */ + counter_u64_add(rack_find_high, 1); + prsm = rsm; + TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) { + if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { + continue; + } + return (prsm); + } + return (NULL); +} + + +static uint32_t +rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) +{ + int32_t lro; + uint32_t thresh; + + /* + * lro is the flag we use to determine if we have seen reordering. + * If it gets set we have seen reordering. The reorder logic either + * works in one of two ways: + * + * If reorder-fade is configured, then we track the last time we saw + * re-ordering occur. If we reach the point where enough time as + * passed we no longer consider reordering has occuring. + * + * Or if reorder-face is 0, then once we see reordering we consider + * the connection to alway be subject to reordering and just set lro + * to 1. + * + * In the end if lro is non-zero we add the extra time for + * reordering in. + */ + if (srtt == 0) + srtt = 1; + if (rack->r_ctl.rc_reorder_ts) { + if (rack->r_ctl.rc_reorder_fade) { + if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { + lro = cts - rack->r_ctl.rc_reorder_ts; + if (lro == 0) { + /* + * No time as passed since the last + * reorder, mark it as reordering. + */ + lro = 1; + } + } else { + /* Negative time? 
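/*
 * Sketch of the RACK reordering threshold computed just below: start
 * from SRTT plus a fixed per-packet delay, widen by a fraction of
 * SRTT when reordering has been observed, and never let the result
 * exceed the retransmit timeout.  Illustrative only; milliseconds.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
rack_thresh(uint32_t srtt, uint32_t pkt_delay, int reordering,
    uint32_t reorder_shift, uint32_t rto)
{
	uint32_t thresh;

	thresh = srtt + pkt_delay;
	if (reordering)
		thresh += srtt >> (reorder_shift ? reorder_shift : 2);
	else
		thresh += 1;		/* tiny pad when no reordering */
	if (thresh > rto)
		thresh = rto;		/* never beyond the RTO */
	return (thresh);
}

int
main(void)
{
	/* 40ms SRTT, 1ms delay, reordering seen, shift 3, 200ms RTO. */
	printf("%u\n", rack_thresh(40, 1, 1, 3, 200));	/* 46 */
	return (0);
}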
*/ + lro = 0; + } + if (lro > rack->r_ctl.rc_reorder_fade) { + /* Turn off reordering seen too */ + rack->r_ctl.rc_reorder_ts = 0; + lro = 0; + } + } else { + /* Reodering does not fade */ + lro = 1; + } + } else { + lro = 0; + } + thresh = srtt + rack->r_ctl.rc_pkt_delay; + if (lro) { + /* It must be set, if not you get 1/4 rtt */ + if (rack->r_ctl.rc_reorder_shift) + thresh += (srtt >> rack->r_ctl.rc_reorder_shift); + else + thresh += (srtt >> 2); + } else { + thresh += 1; + } + /* We don't let the rack timeout be above a RTO */ + + if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { + thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); + } + /* And we don't want it above the RTO max either */ + if (thresh > rack_rto_max) { + thresh = rack_rto_max; + } + return (thresh); +} + +static uint32_t +rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, + struct rack_sendmap *rsm, uint32_t srtt) +{ + struct rack_sendmap *prsm; + uint32_t thresh, len; + int maxseg; + + if (srtt == 0) + srtt = 1; + if (rack->r_ctl.rc_tlp_threshold) + thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); + else + thresh = (srtt * 2); + + /* Get the previous sent packet, if any */ + maxseg = tcp_maxseg(tp); + counter_u64_add(rack_enter_tlp_calc, 1); + len = rsm->r_end - rsm->r_start; + if (rack->rack_tlp_threshold_use == TLP_USE_ID) { + /* Exactly like the ID */ + if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) { + uint32_t alt_thresh; + /* + * Compensate for delayed-ack with the d-ack time. + */ + counter_u64_add(rack_used_tlpmethod, 1); + alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; + if (alt_thresh > thresh) + thresh = alt_thresh; + } + } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { + /* 2.1 behavior */ + prsm = TAILQ_PREV(rsm, rack_head, r_tnext); + if (prsm && (len <= maxseg)) { + /* + * Two packets outstanding, thresh should be (2*srtt) + + * possible inter-packet delay (if any). + */ + uint32_t inter_gap = 0; + int idx, nidx; + + counter_u64_add(rack_used_tlpmethod, 1); + idx = rsm->r_rtr_cnt - 1; + nidx = prsm->r_rtr_cnt - 1; + if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { + /* Yes it was sent later (or at the same time) */ + inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; + } + thresh += inter_gap; + } else if (len <= maxseg) { + /* + * Possibly compensate for delayed-ack. + */ + uint32_t alt_thresh; + + counter_u64_add(rack_used_tlpmethod2, 1); + alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; + if (alt_thresh > thresh) + thresh = alt_thresh; + } + } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { + /* 2.2 behavior */ + if (len <= maxseg) { + uint32_t alt_thresh; + /* + * Compensate for delayed-ack with the d-ack time. + */ + counter_u64_add(rack_used_tlpmethod, 1); + alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; + if (alt_thresh > thresh) + thresh = alt_thresh; + } + } + /* Not above an RTO */ + if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { + thresh = TICKS_2_MSEC(tp->t_rxtcur); + } + /* Not above a RTO max */ + if (thresh > rack_rto_max) { + thresh = rack_rto_max; + } + /* Apply user supplied min TLP */ + if (thresh < rack_tlp_min) { + thresh = rack_tlp_min; + } + return (thresh); +} + +static struct rack_sendmap * +rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) +{ + /* + * Check to see that we don't need to fall into recovery. We will + * need to do so if our oldest transmit is past the time we should + * have had an ack. 
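/*
 * Sketch of the tail-loss-probe wait time computed in
 * rack_calc_thresh_tlp() above: nominally a couple of SRTTs, but when
 * only a segment or so is outstanding the peer may be delaying its
 * ACK, so at least 1.5 * SRTT plus a delayed-ACK allowance is used.
 * Milliseconds; names and values are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
tlp_wait(uint32_t srtt, uint32_t outstanding, uint32_t maxseg,
    uint32_t dack_time, uint32_t rto, uint32_t tlp_min)
{
	uint32_t thresh, alt;

	thresh = 2 * srtt;
	if (outstanding <= maxseg) {
		alt = srtt + (srtt / 2) + dack_time;	/* d-ack padding */
		if (alt > thresh)
			thresh = alt;
	}
	if (thresh > rto)
		thresh = rto;
	if (thresh < tlp_min)
		thresh = tlp_min;
	return (thresh);
}

int
main(void)
{
	/* 20ms SRTT, one segment out, 200ms delayed-ACK allowance. */
	printf("%u\n", tlp_wait(20, 1400, 1460, 200, 400, 10));	/* 230 */
	return (0);
}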
+ */ + struct tcp_rack *rack; + struct rack_sendmap *rsm; + int32_t idx; + uint32_t srtt_cur, srtt, thresh; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { + return (NULL); + } + srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; + srtt = TICKS_2_MSEC(srtt_cur); + if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) + srtt = rack->rc_rack_rtt; + + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + if (rsm == NULL) + return (NULL); + + if (rsm->r_flags & RACK_ACKED) { + rsm = rack_find_lowest_rsm(rack); + if (rsm == NULL) + return (NULL); + } + idx = rsm->r_rtr_cnt - 1; + thresh = rack_calc_thresh_rack(rack, srtt, tsused); + if (tsused < rsm->r_tim_lastsent[idx]) { + return (NULL); + } + if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { + return (NULL); + } + /* Ok if we reach here we are over-due */ + rack->r_ctl.rc_rsm_start = rsm->r_start; + rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; + rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; + rack_cong_signal(tp, NULL, CC_NDUPACK); + return (rsm); +} + +static uint32_t +rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) +{ + int32_t t; + int32_t tt; + uint32_t ret_val; + + t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); + TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], + tcp_persmin, tcp_persmax); + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; + rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; + ret_val = (uint32_t)tt; + return (ret_val); +} + +static uint32_t +rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +{ + /* + * Start the FR timer, we do this based on getting the first one in + * the rc_tmap. Note that if its NULL we must stop the timer. in all + * events we need to stop the running timer (if its running) before + * starting the new one. + */ + uint32_t thresh, exp, to, srtt, time_since_sent; + uint32_t srtt_cur; + int32_t idx; + int32_t is_tlp_timer = 0; + struct rack_sendmap *rsm; + + if (rack->t_timers_stopped) { + /* All timers have been stopped none are to run */ + return (0); + } + if (rack->rc_in_persist) { + /* We can't start any timer in persists */ + return (rack_get_persists_timer_val(tp, rack)); + } + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + if (rsm == NULL) { + /* Nothing on the send map */ +activate_rxt: + if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { + rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; + to = TICKS_2_MSEC(tp->t_rxtcur); + if (to == 0) + to = 1; + return (to); + } + return (0); + } + if (rsm->r_flags & RACK_ACKED) { + rsm = rack_find_lowest_rsm(rack); + if (rsm == NULL) { + /* No lowest? */ + goto activate_rxt; + } + } + /* Convert from ms to usecs */ + if (rsm->r_flags & RACK_SACK_PASSED) { + if ((tp->t_flags & TF_SENTFIN) && + ((tp->snd_max - tp->snd_una) == 1) && + (rsm->r_flags & RACK_HAS_FIN)) { + /* + * We don't start a rack timer if all we have is a + * FIN outstanding. 
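/*
 * Sketch of the persist timer computed in rack_get_persists_timer_val()
 * above: an RTO-like base (SRTT plus four times the RTT variance)
 * scaled by the standard backoff table and clamped between the persist
 * minimum and maximum.  The table and bounds here are illustrative
 * values, not the kernel's tunables.
 */
#include <stdint.h>
#include <stdio.h>

static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64 };

static uint32_t
persist_timeout(uint32_t srtt_ms, uint32_t rttvar_ms, int shift,
    uint32_t persmin, uint32_t persmax)
{
	uint32_t t;

	t = (srtt_ms + 4 * rttvar_ms) * backoff[shift];
	if (t < persmin)
		t = persmin;		/* TCPT_RANGESET lower bound */
	if (t > persmax)
		t = persmax;		/* TCPT_RANGESET upper bound */
	return (t);
}

int
main(void)
{
	/* Base 300ms, third probe, clamped up to the 5s minimum. */
	printf("%u\n", persist_timeout(100, 50, 3, 5000, 60000)); /* 5000 */
	return (0);
}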
+ */ + goto activate_rxt; + } + if (tp->t_srtt) { + srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); + srtt = TICKS_2_MSEC(srtt_cur); + } else + srtt = RACK_INITIAL_RTO; + + thresh = rack_calc_thresh_rack(rack, srtt, cts); + idx = rsm->r_rtr_cnt - 1; + exp = rsm->r_tim_lastsent[idx] + thresh; + if (SEQ_GEQ(exp, cts)) { + to = exp - cts; + if (to < rack->r_ctl.rc_min_to) { + to = rack->r_ctl.rc_min_to; + } + } else { + to = rack->r_ctl.rc_min_to; + } + } else { + /* Ok we need to do a TLP not RACK */ + if ((rack->rc_tlp_in_progress != 0) || + (rack->r_ctl.rc_tlp_rtx_out != 0)) { + /* + * The previous send was a TLP or a tlp_rtx is in + * process. + */ + goto activate_rxt; + } + rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); + if (rsm == NULL) { + /* We found no rsm to TLP with. */ + goto activate_rxt; + } + if (rsm->r_flags & RACK_HAS_FIN) { + /* If its a FIN we dont do TLP */ + rsm = NULL; + goto activate_rxt; + } + idx = rsm->r_rtr_cnt - 1; + if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) + time_since_sent = cts - rsm->r_tim_lastsent[idx]; + else + time_since_sent = 0; + is_tlp_timer = 1; + if (tp->t_srtt) { + srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); + srtt = TICKS_2_MSEC(srtt_cur); + } else + srtt = RACK_INITIAL_RTO; + thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); + if (thresh > time_since_sent) + to = thresh - time_since_sent; + else + to = rack->r_ctl.rc_min_to; + if (to > TCPTV_REXMTMAX) { + /* + * If the TLP time works out to larger than the max + * RTO lets not do TLP.. just RTO. + */ + goto activate_rxt; + } + if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { + /* + * The tail is no longer the last one I did a probe + * on + */ + rack->r_ctl.rc_tlp_seg_send_cnt = 0; + rack->r_ctl.rc_last_tlp_seq = rsm->r_start; + } + } + if (is_tlp_timer == 0) { + rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; + } else { + if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || + (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { + /* + * We have exceeded how many times we can retran the + * current TLP timer, switch to the RTO timer. 
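/*
 * Sketch of how rack_timer_start() above turns a probe threshold into
 * an actual timer: the time already spent since the last transmit is
 * subtracted, a per-connection minimum is enforced, and an absurdly
 * long probe simply falls back to the retransmit timer.  Milliseconds,
 * illustrative names.
 */
#include <stdint.h>
#include <stdio.h>

/* Returns the timer value, or 0 to mean "arm the RXT timer instead". */
static uint32_t
probe_timer(uint32_t thresh, uint32_t time_since_sent, uint32_t min_to,
    uint32_t rexmt_max)
{
	uint32_t to;

	to = thresh > time_since_sent ? thresh - time_since_sent : min_to;
	if (to > rexmt_max)
		return (0);		/* too far out: use RTO instead */
	return (to ? to : 1);		/* never arm a zero-length timer */
}

int
main(void)
{
	printf("%u\n", probe_timer(230, 80, 1, 64000));	/* 150 */
	return (0);
}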
+ */ + goto activate_rxt; + } else { + rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; + } + } + if (to == 0) + to = 1; + return (to); +} + +static void +rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +{ + if (rack->rc_in_persist == 0) { + if (((tp->t_flags & TF_SENTFIN) == 0) && + (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) + /* Must need to send more data to enter persist */ + return; + rack->r_ctl.rc_went_idle_time = cts; + rack_timer_cancel(tp, rack, cts, __LINE__); + tp->t_rxtshift = 0; + rack->rc_in_persist = 1; + } +} + +static void +rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) +{ + if (rack->rc_inp->inp_in_hpts) { + tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); + rack->r_ctl.rc_hpts_flags = 0; + } + rack->rc_in_persist = 0; + rack->r_ctl.rc_went_idle_time = 0; + tp->t_flags &= ~TF_FORCEDATA; + tp->t_rxtshift = 0; +} + +static void +rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, + int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) +{ + struct inpcb *inp; + uint32_t delayed_ack = 0; + uint32_t hpts_timeout; + uint8_t stopped; + uint32_t left = 0; + + inp = tp->t_inpcb; + if (inp->inp_in_hpts) { + /* A previous call is already set up */ + return; + } + if (tp->t_state == TCPS_CLOSED) { + return; + } + stopped = rack->rc_tmr_stopped; + if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { + left = rack->r_ctl.rc_timer_exp - cts; + } + rack->r_ctl.rc_timer_exp = 0; + if (rack->rc_inp->inp_in_hpts == 0) { + rack->r_ctl.rc_hpts_flags = 0; + } + if (slot) { + /* We are hptsi too */ + rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; + } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { + /* + * We are still left on the hpts when the to goes + * it will be for output. + */ + if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) + slot = cts - rack->r_ctl.rc_last_output_to; + else + slot = 1; + } + if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { + /* No send window.. we must enter persist */ + rack_enter_persist(tp, rack, cts); + } else if ((frm_out_sbavail && + (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && + (tp->snd_wnd < tp->t_maxseg)) && + TCPS_HAVEESTABLISHED(tp->t_state)) { + /* + * If we have no window or we can't send a segment (and have + * data to send.. we cheat here and frm_out_sbavail is + * passed in with the sbavail(sb) only from bbr_output) and + * we are established, then we must enter persits (if not + * already in persits). + */ + rack_enter_persist(tp, rack, cts); + } + hpts_timeout = rack_timer_start(tp, rack, cts); + if (tp->t_flags & TF_DELACK) { + delayed_ack = tcp_delacktime; + rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; + } + if (delayed_ack && ((hpts_timeout == 0) || + (delayed_ack < hpts_timeout))) + hpts_timeout = delayed_ack; + else + rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; + /* + * If no timers are going to run and we will fall off the hptsi + * wheel, we resort to a keep-alive timer if its configured. + */ + if ((hpts_timeout == 0) && + (slot == 0)) { + if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && + (tp->t_state <= TCPS_CLOSING)) { + /* + * Ok we have no timer (persists, rack, tlp, rxt or + * del-ack), we don't have segments being paced. So + * all that is left is the keepalive timer. 
+ */ + if (TCPS_HAVEESTABLISHED(tp->t_state)) { + /* Get the established keep-alive time */ + hpts_timeout = TP_KEEPIDLE(tp); + } else { + /* Get the initial setup keep-alive time */ + hpts_timeout = TP_KEEPINIT(tp); + } + rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; + } + } + if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == + (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { + /* + * RACK, TLP, persists and RXT timers all are restartable + * based on actions input .. i.e we received a packet (ack + * or sack) and that changes things (rw, or snd_una etc). + * Thus we can restart them with a new value. For + * keep-alive, delayed_ack we keep track of what was left + * and restart the timer with a smaller value. + */ + if (left < hpts_timeout) + hpts_timeout = left; + } + if (hpts_timeout) { + /* + * Hack alert for now we can't time-out over 2,147,483 + * seconds (a bit more than 596 hours), which is probably ok + * :). + */ + if (hpts_timeout > 0x7ffffffe) + hpts_timeout = 0x7ffffffe; + rack->r_ctl.rc_timer_exp = cts + hpts_timeout; + } + if (slot) { + rack->r_ctl.rc_last_output_to = cts + slot; + if ((hpts_timeout == 0) || (hpts_timeout > slot)) { + if (rack->rc_inp->inp_in_hpts == 0) + tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); + rack_log_to_start(rack, cts, hpts_timeout, slot, 1); + } else { + /* + * Arrange for the hpts to kick back in after the + * t-o if the t-o does not cause a send. + */ + if (rack->rc_inp->inp_in_hpts == 0) + tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); + rack_log_to_start(rack, cts, hpts_timeout, slot, 0); + } + } else if (hpts_timeout) { + if (rack->rc_inp->inp_in_hpts == 0) + tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); + rack_log_to_start(rack, cts, hpts_timeout, slot, 0); + } else { + /* No timer starting */ +#ifdef INVARIANTS + if (SEQ_GT(tp->snd_max, tp->snd_una)) { + panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", + tp, rack, tot_len_this_send, cts, slot, hpts_timeout); + } +#endif + } + rack->rc_tmr_stopped = 0; + if (slot) + rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); +} + +/* + * RACK Timer, here we simply do logging and house keeping. + * the normal rack_output() function will call the + * appropriate thing to check if we need to do a RACK retransmit. + * We return 1, saying don't proceed with rack_output only + * when all timers have been stopped (destroyed PCB?). + */ +static int +rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +{ + /* + * This timer simply provides an internal trigger to send out data. + * The check_recovery_mode call will see if there are needed + * retransmissions, if so we will enter fast-recovery. The output + * call may or may not do the same thing depending on sysctl + * settings. 
+ */ + struct rack_sendmap *rsm; + int32_t recovery; + + if (tp->t_timers->tt_flags & TT_STOPPED) { + return (1); + } + if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { + /* Its not time yet */ + return (0); + } + rack_log_to_event(rack, RACK_TO_FRM_RACK); + recovery = IN_RECOVERY(tp->t_flags); + counter_u64_add(rack_to_tot, 1); + if (rack->r_state && (rack->r_state != tp->t_state)) + rack_set_state(tp, rack); + rsm = rack_check_recovery_mode(tp, cts); + if (rsm) { + uint32_t rtt; + + rtt = rack->rc_rack_rtt; + if (rtt == 0) + rtt = 1; + if ((recovery == 0) && + (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { + /* + * The rack-timeout that enter's us into recovery + * will force out one MSS and set us up so that we + * can do one more send in 2*rtt (transitioning the + * rack timeout into a rack-tlp). + */ + rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && + ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { + /* + * When a rack timer goes, we have to send at + * least one segment. They will be paced a min of 1ms + * apart via the next rack timer (or further + * if the rack timer dictates it). + */ + rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + } + } else { + /* This is a case that should happen rarely if ever */ + counter_u64_add(rack_tlp_does_nada, 1); +#ifdef TCP_BLACKBOX + tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); +#endif + rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + } + rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; + return (0); +} + +/* + * TLP Timer, here we simply setup what segment we want to + * have the TLP expire on, the normal rack_output() will then + * send it out. + * + * We return 1, saying don't proceed with rack_output only + * when all timers have been stopped (destroyed PCB?). + */ +static int +rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +{ + /* + * Tail Loss Probe. + */ + struct rack_sendmap *rsm = NULL; + struct socket *so; + uint32_t amm, old_prr_snd = 0; + uint32_t out, avail; + + if (tp->t_timers->tt_flags & TT_STOPPED) { + return (1); + } + if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { + /* Its not time yet */ + return (0); + } + if (rack_progress_timeout_check(tp)) { + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + return (1); + } + /* + * A TLP timer has expired. We have been idle for 2 rtts. So we now + * need to figure out how to force a full MSS segment out. + */ + rack_log_to_event(rack, RACK_TO_FRM_TLP); + counter_u64_add(rack_tlp_tot, 1); + if (rack->r_state && (rack->r_state != tp->t_state)) + rack_set_state(tp, rack); + so = tp->t_inpcb->inp_socket; + avail = sbavail(&so->so_snd); + out = tp->snd_max - tp->snd_una; + rack->rc_timer_up = 1; + /* + * If we are in recovery we can jazz out a segment if new data is + * present simply by setting rc_prr_sndcnt to a segment. 
+ */ + if ((avail > out) && + ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { + /* New data is available */ + amm = avail - out; + if (amm > tp->t_maxseg) { + amm = tp->t_maxseg; + } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { + /* not enough to fill a MTU and no-delay is off */ + goto need_retran; + } + if (IN_RECOVERY(tp->t_flags)) { + /* Unlikely */ + old_prr_snd = rack->r_ctl.rc_prr_sndcnt; + if (out + amm <= tp->snd_wnd) + rack->r_ctl.rc_prr_sndcnt = amm; + else + goto need_retran; + } else { + /* Set the send-new override */ + if (out + amm <= tp->snd_wnd) + rack->r_ctl.rc_tlp_new_data = amm; + else + goto need_retran; + } + rack->r_ctl.rc_tlp_seg_send_cnt = 0; + rack->r_ctl.rc_last_tlp_seq = tp->snd_max; + rack->r_ctl.rc_tlpsend = NULL; + counter_u64_add(rack_tlp_newdata, 1); + goto send; + } +need_retran: + /* + * Ok we need to arrange the last un-acked segment to be re-sent, or + * optionally the first un-acked segment. + */ + if (rack_always_send_oldest) + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + else { + rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); + if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { + rsm = rack_find_high_nonack(rack, rsm); + } + } + if (rsm == NULL) { + counter_u64_add(rack_tlp_does_nada, 1); +#ifdef TCP_BLACKBOX + tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); +#endif + goto out; + } + if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { + /* + * We need to split this the last segment in two. + */ + int32_t idx; + struct rack_sendmap *nrsm; + + nrsm = rack_alloc(rack); + if (nrsm == NULL) { + /* + * No memory to split, we will just exit and punt + * off to the RXT timer. + */ + counter_u64_add(rack_tlp_does_nada, 1); + goto out; + } + nrsm->r_start = (rsm->r_end - tp->t_maxseg); + nrsm->r_end = rsm->r_end; + nrsm->r_rtr_cnt = rsm->r_rtr_cnt; + nrsm->r_flags = rsm->r_flags; + nrsm->r_sndcnt = rsm->r_sndcnt; + nrsm->r_rtr_bytes = 0; + rsm->r_end = nrsm->r_start; + for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { + nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + } + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + rsm->r_flags &= (~RACK_HAS_FIN); + rsm = nrsm; + } + rack->r_ctl.rc_tlpsend = rsm; + rack->r_ctl.rc_tlp_rtx_out = 1; + if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { + rack->r_ctl.rc_tlp_seg_send_cnt++; + tp->t_rxtshift++; + } else { + rack->r_ctl.rc_last_tlp_seq = rsm->r_start; + rack->r_ctl.rc_tlp_seg_send_cnt = 1; + } +send: + rack->r_ctl.rc_tlp_send_cnt++; + if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { + /* + * Can't [re]/transmit a segment we have not heard from the + * peer in max times. We need the retransmit timer to take + * over. + */ +restore: + rack->r_ctl.rc_tlpsend = NULL; + if (rsm) + rsm->r_flags &= ~RACK_TLP; + rack->r_ctl.rc_prr_sndcnt = old_prr_snd; + counter_u64_add(rack_tlp_retran_fail, 1); + goto out; + } else if (rsm) { + rsm->r_flags |= RACK_TLP; + } + if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && + (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { + /* + * We don't want to send a single segment more than the max + * either. 
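/*
 * Sketch of the decision made in rack_timeout_tlp() above when the
 * probe timer fires: if unsent data exists and fits in the send
 * window, probe with up to one MSS of new data; otherwise fall back
 * to re-sending the tail of what is already outstanding.  Simplified:
 * no Nagle or recovery handling.
 */
#include <stdint.h>
#include <stdio.h>

/* Returns bytes of new data to probe with, or 0 to retransmit tail. */
static uint32_t
tlp_new_data(uint32_t avail, uint32_t out, uint32_t snd_wnd,
    uint32_t maxseg)
{
	uint32_t amm;

	if (avail <= out)
		return (0);		/* nothing unsent */
	amm = avail - out;
	if (amm > maxseg)
		amm = maxseg;		/* probe is at most one segment */
	if (out + amm > snd_wnd)
		return (0);		/* would overrun the peer's window */
	return (amm);
}

int
main(void)
{
	printf("%u\n", tlp_new_data(20000, 11680, 65535, 1460)); /* 1460 */
	return (0);
}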
+ */ + goto restore; + } + rack->r_timer_override = 1; + rack->r_tlp_running = 1; + rack->rc_tlp_in_progress = 1; + rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; + return (0); +out: + rack->rc_timer_up = 0; + rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; + return (0); +} + +/* + * Delayed ack Timer, here we simply need to setup the + * ACK_NOW flag and remove the DELACK flag. From there + * the output routine will send the ack out. + * + * We only return 1, saying don't proceed, if all timers + * are stopped (destroyed PCB?). + */ +static int +rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +{ + if (tp->t_timers->tt_flags & TT_STOPPED) { + return (1); + } + rack_log_to_event(rack, RACK_TO_FRM_DELACK); + tp->t_flags &= ~TF_DELACK; + tp->t_flags |= TF_ACKNOW; + TCPSTAT_INC(tcps_delack); + rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; + return (0); +} + +/* + * Persists timer, here we simply need to setup the + * FORCE-DATA flag the output routine will send + * the one byte send. + * + * We only return 1, saying don't proceed, if all timers + * are stopped (destroyed PCB?). + */ +static int +rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +{ + struct inpcb *inp; + int32_t retval = 0; + + inp = tp->t_inpcb; + + if (tp->t_timers->tt_flags & TT_STOPPED) { + return (1); + } + if (rack->rc_in_persist == 0) + return (0); + if (rack_progress_timeout_check(tp)) { + tcp_set_inp_to_drop(inp, ETIMEDOUT); + return (1); + } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); + /* + * Persistence timer into zero window. Force a byte to be output, if + * possible. + */ + TCPSTAT_INC(tcps_persisttimeo); + /* + * Hack: if the peer is dead/unreachable, we do not time out if the + * window is closed. After a full backoff, drop the connection if + * the idle time (no responses to probes) reaches the maximum + * backoff that we would use if retransmitting. + */ + if (tp->t_rxtshift == TCP_MAXRXTSHIFT && + (ticks - tp->t_rcvtime >= tcp_maxpersistidle || + ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + TCPSTAT_INC(tcps_persistdrop); + retval = 1; + tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); + goto out; + } + if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && + tp->snd_una == tp->snd_max) + rack_exit_persist(tp, rack); + rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; + /* + * If the user has closed the socket then drop a persisting + * connection after a much reduced timeout. + */ + if (tp->t_state > TCPS_CLOSE_WAIT && + (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { + retval = 1; + TCPSTAT_INC(tcps_persistdrop); + tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); + goto out; + } + tp->t_flags |= TF_FORCEDATA; +out: + rack_log_to_event(rack, RACK_TO_FRM_PERSIST); + return (retval); +} + +/* + * If a keepalive goes off, we had no other timers + * happening. We always return 1 here since this + * routine either drops the connection or sends + * out a segment with respond. + */ +static int +rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +{ + struct tcptemp *t_template; + struct inpcb *inp; + + if (tp->t_timers->tt_flags & TT_STOPPED) { + return (1); + } + rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; + inp = tp->t_inpcb; + rack_log_to_event(rack, RACK_TO_FRM_KEEP); + /* + * Keep-alive timer went off; send something or drop connection if + * idle for too long. 
+ */ + TCPSTAT_INC(tcps_keeptimeo); + if (tp->t_state < TCPS_ESTABLISHED) + goto dropit; + if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && + tp->t_state <= TCPS_CLOSING) { + if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) + goto dropit; + /* + * Send a packet designed to force a response if the peer is + * up and reachable: either an ACK if the connection is + * still alive, or an RST if the peer has closed the + * connection due to timeout or reboot. Using sequence + * number tp->snd_una-1 causes the transmitted zero-length + * segment to lie outside the receive window; by the + * protocol spec, this requires the correspondent TCP to + * respond. + */ + TCPSTAT_INC(tcps_keepprobe); + t_template = tcpip_maketemplate(inp); + if (t_template) { + tcp_respond(tp, t_template->tt_ipgen, + &t_template->tt_t, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); + free(t_template, M_TEMP); + } + } + rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); + return (1); +dropit: + TCPSTAT_INC(tcps_keepdrops); + tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); + return (1); +} + +/* + * Retransmit helper function, clear up all the ack + * flags and take care of important book keeping. + */ +static void +rack_remxt_tmr(struct tcpcb *tp) +{ + /* + * The retransmit timer went off, all sack'd blocks must be + * un-acked. + */ + struct rack_sendmap *rsm, *trsm = NULL; + struct tcp_rack *rack; + int32_t cnt = 0; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); + rack_log_to_event(rack, RACK_TO_FRM_TMR); + if (rack->r_state && (rack->r_state != tp->t_state)) + rack_set_state(tp, rack); + /* + * Ideally we would like to be able to + * mark SACK-PASS on anything not acked here. + * However, if we do that we would burst out + * all that data 1ms apart. This would be unwise, + * so for now we will just let the normal rxt timer + * and tlp timer take care of it. + */ + TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { + if (rsm->r_flags & RACK_ACKED) { + cnt++; + rsm->r_sndcnt = 0; + if (rsm->r_in_tmap == 0) { + /* We must re-add it back to the tlist */ + if (trsm == NULL) { + TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); + } else { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); + } + rsm->r_in_tmap = 1; + trsm = rsm; + } + } + rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); + } + /* Clear the count (we just un-acked them) */ + rack->r_ctl.rc_sacked = 0; + /* Clear the tlp rtx mark */ + rack->r_ctl.rc_tlp_rtx_out = 0; + rack->r_ctl.rc_tlp_seg_send_cnt = 0; + rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); + /* Setup so we send one segment */ + if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) + rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_timer_override = 1; +} + +/* + * Re-transmit timeout! If we drop the PCB we will return 1, otherwise + * we will setup to retransmit the lowest seq number outstanding. + */ +static int +rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +{ + int32_t rexmt; + struct inpcb *inp; + int32_t retval = 0; + + inp = tp->t_inpcb; + if (tp->t_timers->tt_flags & TT_STOPPED) { + return (1); + } + if (rack_progress_timeout_check(tp)) { + tcp_set_inp_to_drop(inp, ETIMEDOUT); + return (1); + } + rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; + if (TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_una == tp->snd_max)) { + /* Nothing outstanding .. nothing to do */ + return (0); + } + /* + * Retransmission timer went off. 
Message has not been acked within + * retransmit interval. Back off to a longer retransmit interval + * and retransmit one segment. + */ + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + tp->t_rxtshift = TCP_MAXRXTSHIFT; + TCPSTAT_INC(tcps_timeoutdrop); + retval = 1; + tcp_set_inp_to_drop(rack->rc_inp, + (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); + goto out; + } + rack_remxt_tmr(tp); + if (tp->t_state == TCPS_SYN_SENT) { + /* + * If the SYN was retransmitted, indicate CWND to be limited + * to 1 segment in cc_conn_init(). + */ + tp->snd_cwnd = 1; + } else if (tp->t_rxtshift == 1) { + /* + * first retransmit; record ssthresh and cwnd so they can be + * recovered if this turns out to be a "bad" retransmit. A + * retransmit is considered "bad" if an ACK for this segment + * is received within RTT/2 interval; the assumption here is + * that the ACK was already in flight. See "On Estimating + * End-to-End Network Path Properties" by Allman and Paxson + * for more details. + */ + tp->snd_cwnd_prev = tp->snd_cwnd; + tp->snd_ssthresh_prev = tp->snd_ssthresh; + tp->snd_recover_prev = tp->snd_recover; + if (IN_FASTRECOVERY(tp->t_flags)) + tp->t_flags |= TF_WASFRECOVERY; + else + tp->t_flags &= ~TF_WASFRECOVERY; + if (IN_CONGRECOVERY(tp->t_flags)) + tp->t_flags |= TF_WASCRECOVERY; + else + tp->t_flags &= ~TF_WASCRECOVERY; + tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + tp->t_flags |= TF_PREVVALID; + } else + tp->t_flags &= ~TF_PREVVALID; + TCPSTAT_INC(tcps_rexmttimeo); + if ((tp->t_state == TCPS_SYN_SENT) || + (tp->t_state == TCPS_SYN_RECEIVED)) + rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); + else + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + TCPT_RANGESET(tp->t_rxtcur, rexmt, + max(MSEC_2_TICKS(rack_rto_min), rexmt), + MSEC_2_TICKS(rack_rto_max)); + /* + * We enter the path for PLMTUD if connection is established or, if + * connection is FIN_WAIT_1 status, reason for the last is that if + * amount of data we send is very small, we could send it in couple + * of packets and process straight to FIN. In that case we won't + * catch ESTABLISHED state. + */ + if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) + || (tp->t_state == TCPS_FIN_WAIT_1))) { +#ifdef INET6 + int32_t isipv6; +#endif + + /* + * Idea here is that at each stage of mtu probe (usually, + * 1448 -> 1188 -> 524) should be given 2 chances to recover + * before further clamping down. 'tp->t_rxtshift % 2 == 0' + * should take care of that. + */ + if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == + (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && + (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && + tp->t_rxtshift % 2 == 0)) { + /* + * Enter Path MTU Black-hole Detection mechanism: - + * Disable Path MTU Discovery (IP "DF" bit). - + * Reduce MTU to lower value than what we negotiated + * with peer. + */ + if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { + /* Record that we may have found a black hole. */ + tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; + /* Keep track of previous MSS. */ + tp->t_pmtud_saved_maxseg = tp->t_maxseg; + } + + /* + * Reduce the MSS to blackhole value or to the + * default in an attempt to retransmit. + */ +#ifdef INET6 + isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; + if (isipv6 && + tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { + /* Use the sysctl tuneable blackhole MSS. 
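/*
 * Sketch of the retransmit timeout backoff applied above: the base
 * RTO is multiplied by a backoff table indexed by the retransmit
 * count, then clamped between the RACK minimum and maximum RTO.
 * Milliseconds and table values are illustrative, not the kernel's.
 */
#include <stdint.h>
#include <stdio.h>

static const int tcp_backoff_tbl[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256 };

static uint32_t
rto_backoff(uint32_t base_rto, int rxtshift, uint32_t rto_min,
    uint32_t rto_max)
{
	uint32_t rexmt;

	rexmt = base_rto * tcp_backoff_tbl[rxtshift];
	if (rexmt < rto_min)
		rexmt = rto_min;	/* floor, like rack_rto_min */
	if (rexmt > rto_max)
		rexmt = rto_max;	/* ceiling, like rack_rto_max */
	return (rexmt);
}

int
main(void)
{
	int i;

	for (i = 0; i <= 6; i++)	/* 200ms base, 30s cap */
		printf("shift %d -> %u ms\n", i,
		    rto_backoff(200, i, 30, 30000));
	return (0);
}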
*/ + tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; + TCPSTAT_INC(tcps_pmtud_blackhole_activated); + } else if (isipv6) { + /* Use the default MSS. */ + tp->t_maxseg = V_tcp_v6mssdflt; + /* + * Disable Path MTU Discovery when we switch + * to minmss. + */ + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { + /* Use the sysctl tuneable blackhole MSS. */ + tp->t_maxseg = V_tcp_pmtud_blackhole_mss; + TCPSTAT_INC(tcps_pmtud_blackhole_activated); + } else { + /* Use the default MSS. */ + tp->t_maxseg = V_tcp_mssdflt; + /* + * Disable Path MTU Discovery when we switch + * to minmss. + */ + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); + } +#endif + } else { + /* + * If further retransmissions are still unsuccessful + * with a lowered MTU, maybe this isn't a blackhole + * and we restore the previous MSS and blackhole + * detection flags. The limit '6' is determined by + * giving each probe stage (1448, 1188, 524) 2 + * chances to recover. + */ + if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && + (tp->t_rxtshift >= 6)) { + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; + tp->t_maxseg = tp->t_pmtud_saved_maxseg; + TCPSTAT_INC(tcps_pmtud_blackhole_failed); + } + } + } + /* + * Disable RFC1323 and SACK if we haven't got any response to our + * third SYN to work-around some broken terminal servers (most of + * which have hopefully been retired) that have bad VJ header + * compression code which trashes TCP segments containing + * unknown-to-them TCP options. + */ + if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && + (tp->t_rxtshift == 3)) + tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); + /* + * If we backed off this far, our srtt estimate is probably bogus. + * Clobber it so we'll take the next rtt measurement as our srtt; + * move the current srtt into rttvar to keep the current retransmit + * times until then. + */ + if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { +#ifdef INET6 + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) + in6_losing(tp->t_inpcb); + else +#endif + in_losing(tp->t_inpcb); + tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); + tp->t_srtt = 0; + } + if (rack_use_sack_filter) + sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); + tp->snd_recover = tp->snd_max; + tp->t_flags |= TF_ACKNOW; + tp->t_rtttime = 0; + rack_cong_signal(tp, NULL, CC_RTO); +out: + return (retval); +} + +static int +rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) +{ + int32_t ret = 0; + int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); + + if (timers == 0) { + return (0); + } + if (tp->t_state == TCPS_LISTEN) { + /* no timers on listen sockets */ + if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) + return (0); + return (1); + } + if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { + uint32_t left; + + if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { + ret = -1; + rack_log_to_processing(rack, cts, ret, 0); + return (0); + } + if (hpts_calling == 0) { + ret = -2; + rack_log_to_processing(rack, cts, ret, 0); + return (0); + } + /* + * Ok our timer went off early and we are not paced false + * alarm, go back to sleep. 
+ */ + ret = -3; + left = rack->r_ctl.rc_timer_exp - cts; + tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); + rack_log_to_processing(rack, cts, ret, left); + rack->rc_last_pto_set = 0; + return (1); + } + rack->rc_tmr_stopped = 0; + rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; + if (timers & PACE_TMR_DELACK) { + ret = rack_timeout_delack(tp, rack, cts); + } else if (timers & PACE_TMR_RACK) { + ret = rack_timeout_rack(tp, rack, cts); + } else if (timers & PACE_TMR_TLP) { + ret = rack_timeout_tlp(tp, rack, cts); + } else if (timers & PACE_TMR_RXT) { + ret = rack_timeout_rxt(tp, rack, cts); + } else if (timers & PACE_TMR_PERSIT) { + ret = rack_timeout_persist(tp, rack, cts); + } else if (timers & PACE_TMR_KEEP) { + ret = rack_timeout_keepalive(tp, rack, cts); + } + rack_log_to_processing(rack, cts, ret, timers); + return (ret); +} + +static void +rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) +{ + uint8_t hpts_removed = 0; + + if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && + TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { + tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); + hpts_removed = 1; + } + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { + rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; + if (rack->rc_inp->inp_in_hpts && + ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { + /* + * Canceling timer's when we have no output being + * paced. We also must remove ourselves from the + * hpts. + */ + tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); + hpts_removed = 1; + } + rack_log_to_cancel(rack, hpts_removed, line); + rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); + } +} + +static void +rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) +{ + return; +} + +static int +rack_stopall(struct tcpcb *tp) +{ + struct tcp_rack *rack; + rack = (struct tcp_rack *)tp->t_fb_ptr; + rack->t_timers_stopped = 1; + return (0); +} + +static void +rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) +{ + return; +} + +static int +rack_timer_active(struct tcpcb *tp, uint32_t timer_type) +{ + return (0); +} + +static void +rack_stop_all_timers(struct tcpcb *tp) +{ + struct tcp_rack *rack; + + /* + * Assure no timers are running. 
+ */ + if (tcp_timer_active(tp, TT_PERSIST)) { + /* We enter in persists, set the flag appropriately */ + rack = (struct tcp_rack *)tp->t_fb_ptr; + rack->rc_in_persist = 1; + } + tcp_timer_suspend(tp, TT_PERSIST); + tcp_timer_suspend(tp, TT_REXMT); + tcp_timer_suspend(tp, TT_KEEP); + tcp_timer_suspend(tp, TT_DELACK); +} + +static void +rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, + struct rack_sendmap *rsm, uint32_t ts) +{ + int32_t idx; + + rsm->r_rtr_cnt++; + rsm->r_sndcnt++; + if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { + rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; + rsm->r_flags |= RACK_OVERMAX; + } + if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { + rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); + rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); + } + idx = rsm->r_rtr_cnt - 1; + rsm->r_tim_lastsent[idx] = ts; + if (rsm->r_flags & RACK_ACKED) { + /* Problably MTU discovery messing with us */ + rsm->r_flags &= ~RACK_ACKED; + rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); + } + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); + } + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 1; + if (rsm->r_flags & RACK_SACK_PASSED) { + /* We have retransmitted due to the SACK pass */ + rsm->r_flags &= ~RACK_SACK_PASSED; + rsm->r_flags |= RACK_WAS_SACKPASS; + } + /* Update memory for next rtr */ + rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); +} + + +static uint32_t +rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, + struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) +{ + /* + * We (re-)transmitted starting at rsm->r_start for some length + * (possibly less than r_end. + */ + struct rack_sendmap *nrsm; + uint32_t c_end; + int32_t len; + int32_t idx; + + len = *lenp; + c_end = rsm->r_start + len; + if (SEQ_GEQ(c_end, rsm->r_end)) { + /* + * We retransmitted the whole piece or more than the whole + * slopping into the next rsm. + */ + rack_update_rsm(tp, rack, rsm, ts); + if (c_end == rsm->r_end) { + *lenp = 0; + return (0); + } else { + int32_t act_len; + + /* Hangs over the end return whats left */ + act_len = rsm->r_end - rsm->r_start; + *lenp = (len - act_len); + return (rsm->r_end); + } + /* We don't get out of this block. */ + } + /* + * Here we retransmitted less than the whole thing which means we + * have to split this into what was transmitted and what was not. + */ + nrsm = rack_alloc(rack); + if (nrsm == NULL) { + /* + * We can't get memory, so lets not proceed. + */ + *lenp = 0; + return (0); + } + /* + * So here we are going to take the original rsm and make it what we + * retransmitted. nrsm will be the tail portion we did not + * retransmit. For example say the chunk was 1, 11 (10 bytes). And + * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to + * 1, 6 and the new piece will be 6, 11. 
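/*
 * Sketch of the sendmap split described in the comment above and
 * carried out just below: a partial retransmit cuts [r_start, r_end)
 * at c_end so the original entry keeps the retransmitted prefix and a
 * new entry carries the untouched tail.  Minimal struct; timestamps,
 * flags and list linkage are omitted.
 */
#include <stdint.h>
#include <stdio.h>

struct seg {
	uint32_t r_start;	/* first sequence number covered */
	uint32_t r_end;		/* one past the last sequence number */
};

static void
split_at(struct seg *rsm, struct seg *nrsm, uint32_t c_end)
{
	nrsm->r_start = c_end;		/* tail: not retransmitted */
	nrsm->r_end = rsm->r_end;
	rsm->r_end = c_end;		/* head: what was retransmitted */
}

int
main(void)
{
	struct seg rsm = { 1, 11 }, nrsm;

	split_at(&rsm, &nrsm, 6);	/* retransmitted 5 bytes: 1..5 */
	printf("head [%u,%u) tail [%u,%u)\n",
	    rsm.r_start, rsm.r_end, nrsm.r_start, nrsm.r_end);
	return (0);
}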
+ */ + nrsm->r_start = c_end; + nrsm->r_end = rsm->r_end; + nrsm->r_rtr_cnt = rsm->r_rtr_cnt; + nrsm->r_flags = rsm->r_flags; + nrsm->r_sndcnt = rsm->r_sndcnt; + nrsm->r_rtr_bytes = 0; + rsm->r_end = c_end; + for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { + nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + } + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + rsm->r_flags &= (~RACK_HAS_FIN); + rack_update_rsm(tp, rack, rsm, ts); + *lenp = 0; + return (0); +} + + +static void +rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, + uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, + uint8_t pass, struct rack_sendmap *hintrsm) +{ + struct tcp_rack *rack; + struct rack_sendmap *rsm, *nrsm; + register uint32_t snd_max, snd_una; + int32_t idx; + + /* + * Add to the RACK log of packets in flight or retransmitted. If + * there is a TS option we will use the TS echoed, if not we will + * grab a TS. + * + * Retransmissions will increment the count and move the ts to its + * proper place. Note that if options do not include TS's then we + * won't be able to effectively use the ACK for an RTT on a retran. + * + * Notes about r_start and r_end. Lets consider a send starting at + * sequence 1 for 10 bytes. In such an example the r_start would be + * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. + * This means that r_end is actually the first sequence for the next + * slot (11). + * + */ + /* + * If err is set what do we do XXXrrs? should we not add the thing? + * -- i.e. return if err != 0 or should we pretend we sent it? -- + * i.e. proceed with add ** do this for now. + */ + INP_WLOCK_ASSERT(tp->t_inpcb); + if (err) + /* + * We don't log errors -- we could but snd_max does not + * advance in this case either. + */ + return; + + if (th_flags & TH_RST) { + /* + * We don't log resets and we return immediately from + * sending + */ + return; + } + rack = (struct tcp_rack *)tp->t_fb_ptr; + snd_una = tp->snd_una; + if (SEQ_LEQ((seq_out + len), snd_una)) { + /* Are sending an old segment to induce an ack (keep-alive)? */ + return; + } + if (SEQ_LT(seq_out, snd_una)) { + /* huh? should we panic? */ + uint32_t end; + + end = seq_out + len; + seq_out = snd_una; + len = end - seq_out; + } + snd_max = tp->snd_max; + if (th_flags & (TH_SYN | TH_FIN)) { + /* + * The call to rack_log_output is made before bumping + * snd_max. This means we can record one extra byte on a SYN + * or FIN if seq_out is adding more on and a FIN is present + * (and we are not resending). + */ + if (th_flags & TH_SYN) + len++; + if (th_flags & TH_FIN) + len++; + if (SEQ_LT(snd_max, tp->snd_nxt)) { + /* + * The add/update as not been done for the FIN/SYN + * yet. + */ + snd_max = tp->snd_nxt; + } + } + if (len == 0) { + /* We don't log zero window probes */ + return; + } + rack->r_ctl.rc_time_last_sent = ts; + if (IN_RECOVERY(tp->t_flags)) { + rack->r_ctl.rc_prr_out += len; + } + /* First question is it a retransmission? */ + if (seq_out == snd_max) { +again: + rsm = rack_alloc(rack); + if (rsm == NULL) { + /* + * Hmm out of memory and the tcb got destroyed while + * we tried to wait. 
+ */ +#ifdef INVARIANTS + panic("Out of memory when we should not be rack:%p", rack); +#endif + return; + } + if (th_flags & TH_FIN) { + rsm->r_flags = RACK_HAS_FIN; + } else { + rsm->r_flags = 0; + } + rsm->r_tim_lastsent[0] = ts; + rsm->r_rtr_cnt = 1; + rsm->r_rtr_bytes = 0; + rsm->r_start = seq_out; + rsm->r_end = rsm->r_start + len; + rsm->r_sndcnt = 0; + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 1; + return; + } + /* + * If we reach here its a retransmission and we need to find it. + */ +more: + if (hintrsm && (hintrsm->r_start == seq_out)) { + rsm = hintrsm; + hintrsm = NULL; + } else if (rack->r_ctl.rc_next) { + /* We have a hint from a previous run */ + rsm = rack->r_ctl.rc_next; + } else { + /* No hints sorry */ + rsm = NULL; + } + if ((rsm) && (rsm->r_start == seq_out)) { + /* + * We used rc_next or hintrsm to retransmit, hopefully the + * likely case. + */ + seq_out = rack_update_entry(tp, rack, rsm, ts, &len); + if (len == 0) { + return; + } else { + goto more; + } + } + /* Ok it was not the last pointer go through it the hard way. */ + TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { + if (rsm->r_start == seq_out) { + seq_out = rack_update_entry(tp, rack, rsm, ts, &len); + rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); + if (len == 0) { + return; + } else { + continue; + } + } + if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { + /* Transmitted within this piece */ + /* + * Ok we must split off the front and then let the + * update do the rest + */ + nrsm = rack_alloc(rack); + if (nrsm == NULL) { +#ifdef INVARIANTS + panic("Ran out of memory that was preallocated? rack:%p", rack); +#endif + rack_update_rsm(tp, rack, rsm, ts); + return; + } + /* + * copy rsm to nrsm and then trim the front of rsm + * to not include this part. + */ + nrsm->r_start = seq_out; + nrsm->r_end = rsm->r_end; + nrsm->r_rtr_cnt = rsm->r_rtr_cnt; + nrsm->r_flags = rsm->r_flags; + nrsm->r_sndcnt = rsm->r_sndcnt; + nrsm->r_rtr_bytes = 0; + for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { + nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + } + rsm->r_end = nrsm->r_start; + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + rsm->r_flags &= (~RACK_HAS_FIN); + seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); + if (len == 0) { + return; + } + } + } + /* + * Hmm not found in map did they retransmit both old and on into the + * new? + */ + if (seq_out == tp->snd_max) { + goto again; + } else if (SEQ_LT(seq_out, tp->snd_max)) { +#ifdef INVARIANTS + printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", + seq_out, len, tp->snd_una, tp->snd_max); + printf("Starting Dump of all rack entries\n"); + TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { + printf("rsm:%p start:%u end:%u\n", + rsm, rsm->r_start, rsm->r_end); + } + printf("Dump complete\n"); + panic("seq_out not found rack:%p tp:%p", + rack, tp); +#endif + } else { +#ifdef INVARIANTS + /* + * Hmm beyond sndmax? (only if we are using the new rtt-pack + * flag) + */ + panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", + seq_out, len, tp->snd_max, tp); +#endif + } +} + +/* + * Record one of the RTT updates from an ack into + * our sample structure. 
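+ * The rack_rs structure accumulates, per ack processed, the lowest and
+ * highest RTT observed plus a running total and count, so that
+ * tcp_rack_xmit_timer_commit() can later apply whichever of
+ * USE_RTT_LOW, USE_RTT_HIGH or USE_RTT_AVG the rate-sample method
+ * selects.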
+ */ +static void +tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) +{ + if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || + (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { + rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; + } + if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || + (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { + rack->r_ctl.rack_rs.rs_rtt_highest = rtt; + } + rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; + rack->r_ctl.rack_rs.rs_rtt_tot += rtt; + rack->r_ctl.rack_rs.rs_rtt_cnt++; +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +static void +tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) +{ + int32_t delta; + uint32_t o_srtt, o_var; + int32_t rtt; + + if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) + /* No valid sample */ + return; + if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { + /* We are to use the lowest RTT seen in a single ack */ + rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; + } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { + /* We are to use the highest RTT seen in a single ack */ + rtt = rack->r_ctl.rack_rs.rs_rtt_highest; + } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { + /* We are to use the average RTT seen in a single ack */ + rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / + (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); + } else { +#ifdef INVARIANTS + panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); +#endif + return; + } + if (rtt == 0) + rtt = 1; + rack_log_rtt_sample(rack, rtt); + o_srtt = tp->t_srtt; + o_var = tp->t_rttvar; + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 5 bits after the + * binary point (i.e., scaled by 8). The following magic is + * equivalent to the smoothing algorithm in rfc793 with an + * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). + * Adjust rtt to origin 0. + */ + delta = ((rtt - 1) << TCP_DELTA_SHIFT) + - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + + tp->t_srtt += delta; + if (tp->t_srtt <= 0) + tp->t_srtt = 1; + + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit timer + * to smoothed rtt + 4 times the smoothed variance. rttvar + * is stored as fixed point with 4 bits after the binary + * point (scaled by 16). The following is equivalent to + * rfc793 smoothing with an alpha of .75 (rttvar = + * rttvar*3/4 + |delta| / 4). This replaces rfc793's + * wired-in beta. + */ + if (delta < 0) + delta = -delta; + delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); + tp->t_rttvar += delta; + if (tp->t_rttvar <= 0) + tp->t_rttvar = 1; + if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) + tp->t_rttbest = tp->t_srtt + tp->t_rttvar; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. Set the + * variance to half the rtt (so our first retransmit happens + * at 3*rtt). + */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + tp->t_rttbest = tp->t_srtt + tp->t_rttvar; + } + TCPSTAT_INC(tcps_rttupdated); + rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); + tp->t_rttupdated++; +#ifdef NETFLIX_STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); +#endif + tp->t_rxtshift = 0; + + /* + * the retransmit should happen at rtt + 4 * rttvar. Because of the + * way we do the smoothing, srtt and rttvar will each average +1/2 + * tick of bias. 
When we compute the retransmit timer, we want 1/2 + * tick of rounding and 1 extra tick because of +-1/2 tick + * uncertainty in the firing of the timer. The bias will give us + * exactly the 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below the minimum + * feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); + tp->t_softerror = 0; +} + +static void +rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, + uint32_t t, uint32_t cts) +{ + /* + * For this RSM, we acknowledged the data from a previous + * transmission, not the last one we made. This means we did a false + * retransmit. + */ + struct tcp_rack *rack; + + if (rsm->r_flags & RACK_HAS_FIN) { + /* + * The sending of the FIN often is multiple sent when we + * have everything outstanding ack'd. We ignore this case + * since its over now. + */ + return; + } + if (rsm->r_flags & RACK_TLP) { + /* + * We expect TLP's to have this occur. + */ + return; + } + rack = (struct tcp_rack *)tp->t_fb_ptr; + /* should we undo cc changes and exit recovery? */ + if (IN_RECOVERY(tp->t_flags)) { + if (rack->r_ctl.rc_rsm_start == rsm->r_start) { + /* + * Undo what we ratched down and exit recovery if + * possible + */ + EXIT_RECOVERY(tp->t_flags); + tp->snd_recover = tp->snd_una; + if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) + tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; + if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) + tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; + } + } + if (rsm->r_flags & RACK_WAS_SACKPASS) { + /* + * We retransmitted based on a sack and the earlier + * retransmission ack'd it - re-ordering is occuring. + */ + counter_u64_add(rack_reorder_seen, 1); + rack->r_ctl.rc_reorder_ts = cts; + } + counter_u64_add(rack_badfr, 1); + counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); +} + + +static int +rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, + struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) +{ + int32_t i; + uint32_t t; + + if (rsm->r_flags & RACK_ACKED) + /* Already done */ + return (0); + + + if ((rsm->r_rtr_cnt == 1) || + ((ack_type == CUM_ACKED) && + (to->to_flags & TOF_TS) && + (to->to_tsecr) && + (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) + ) { + /* + * We will only find a matching timestamp if its cum-acked. + * But if its only one retransmission its for-sure matching + * :-) + */ + t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; + if ((int)t <= 0) + t = 1; + if (!tp->t_rttlow || tp->t_rttlow > t) + tp->t_rttlow = t; + if (!rack->r_ctl.rc_rack_min_rtt || + SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { + rack->r_ctl.rc_rack_min_rtt = t; + if (rack->r_ctl.rc_rack_min_rtt == 0) { + rack->r_ctl.rc_rack_min_rtt = 1; + } + } + tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); + if ((rsm->r_flags & RACK_TLP) && + (!IN_RECOVERY(tp->t_flags))) { + /* Segment was a TLP and our retrans matched */ + if (rack->r_ctl.rc_tlp_cwnd_reduce) { + rack->r_ctl.rc_rsm_start = tp->snd_max; + rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; + rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; + rack_cong_signal(tp, NULL, CC_NDUPACK); + /* + * When we enter recovery we need to assure + * we send one packet. 
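+ * Seeding rc_prr_sndcnt with one MSS below is what guarantees that
+ * first packet goes out before the PRR accounting takes over.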
+ */ + rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + } else + rack->r_ctl.rc_tlp_rtx_out = 0; + } + if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { + /* New more recent rack_tmit_time */ + rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; + rack->rc_rack_rtt = t; + } + return (1); + } + /* + * We clear the soft/rxtshift since we got an ack. + * There is no assurance we will call the commit() function + * so we need to clear these to avoid incorrect handling. + */ + tp->t_rxtshift = 0; + tp->t_softerror = 0; + if ((to->to_flags & TOF_TS) && + (ack_type == CUM_ACKED) && + (to->to_tsecr) && + ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { + /* + * Now which timestamp does it match? In this block the ACK + * must be coming from a previous transmission. + */ + for (i = 0; i < rsm->r_rtr_cnt; i++) { + if (rsm->r_tim_lastsent[i] == to->to_tsecr) { + t = cts - rsm->r_tim_lastsent[i]; + if ((int)t <= 0) + t = 1; + if ((i + 1) < rsm->r_rtr_cnt) { + /* Likely */ + rack_earlier_retran(tp, rsm, t, cts); + } + if (!tp->t_rttlow || tp->t_rttlow > t) + tp->t_rttlow = t; + if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { + rack->r_ctl.rc_rack_min_rtt = t; + if (rack->r_ctl.rc_rack_min_rtt == 0) { + rack->r_ctl.rc_rack_min_rtt = 1; + } + } + /* + * Note the following calls to + * tcp_rack_xmit_timer() are being commented + * out for now. They give us no more accuracy + * and often lead to a wrong choice. We have + * enough samples that have not been + * retransmitted. I leave the commented out + * code in here in case in the future we + * decide to add it back (though I can't forsee + * doing that). That way we will easily see + * where they need to be placed. + */ + if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, + rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { + /* New more recent rack_tmit_time */ + rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; + rack->rc_rack_rtt = t; + } + return (1); + } + } + goto ts_not_found; + } else { + /* + * Ok its a SACK block that we retransmitted. or a windows + * machine without timestamps. We can tell nothing from the + * time-stamp since its not there or the time the peer last + * recieved a segment that moved forward its cum-ack point. + */ +ts_not_found: + i = rsm->r_rtr_cnt - 1; + t = cts - rsm->r_tim_lastsent[i]; + if ((int)t <= 0) + t = 1; + if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { + /* + * We retransmitted and the ack came back in less + * than the smallest rtt we have observed. We most + * likey did an improper retransmit as outlined in + * 4.2 Step 3 point 2 in the rack-draft. + */ + i = rsm->r_rtr_cnt - 2; + t = cts - rsm->r_tim_lastsent[i]; + rack_earlier_retran(tp, rsm, t, cts); + } else if (rack->r_ctl.rc_rack_min_rtt) { + /* + * We retransmitted it and the retransmit did the + * job. + */ + if (!rack->r_ctl.rc_rack_min_rtt || + SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { + rack->r_ctl.rc_rack_min_rtt = t; + if (rack->r_ctl.rc_rack_min_rtt == 0) { + rack->r_ctl.rc_rack_min_rtt = 1; + } + } + if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { + /* New more recent rack_tmit_time */ + rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; + rack->rc_rack_rtt = t; + } + return (1); + } + } + return (0); +} + +/* + * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 
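+ * "Send wise" means we walk rc_tmap (transmit order) backwards from
+ * rsm: any segment sent before the newly sacked one, and not itself
+ * acked, gets RACK_SACK_PASSED and becomes a candidate for RACK loss
+ * detection.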
+ */ +static void +rack_log_sack_passed(struct tcpcb *tp, + struct tcp_rack *rack, struct rack_sendmap *rsm) +{ + struct rack_sendmap *nrsm; + uint32_t ts; + int32_t idx; + + idx = rsm->r_rtr_cnt - 1; + ts = rsm->r_tim_lastsent[idx]; + nrsm = rsm; + TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, + rack_head, r_tnext) { + if (nrsm == rsm) { + /* Skip orginal segment he is acked */ + continue; + } + if (nrsm->r_flags & RACK_ACKED) { + /* Skip ack'd segments */ + continue; + } + idx = nrsm->r_rtr_cnt - 1; + if (ts == nrsm->r_tim_lastsent[idx]) { + /* + * For this case lets use seq no, if we sent in a + * big block (TSO) we would have a bunch of segments + * sent at the same time. + * + * We would only get a report if its SEQ is earlier. + * If we have done multiple retransmits the times + * would not be equal. + */ + if (SEQ_LT(nrsm->r_start, rsm->r_start)) { + nrsm->r_flags |= RACK_SACK_PASSED; + nrsm->r_flags &= ~RACK_WAS_SACKPASS; + } + } else { + /* + * Here they were sent at different times, not a big + * block. Since we transmitted this one later and + * see it sack'd then this must also be missing (or + * we would have gotten a sack block for it) + */ + nrsm->r_flags |= RACK_SACK_PASSED; + nrsm->r_flags &= ~RACK_WAS_SACKPASS; + } + } +} + +static uint32_t +rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, + struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) +{ + int32_t idx; + int32_t times = 0; + uint32_t start, end, changed = 0; + struct rack_sendmap *rsm, *nrsm; + int32_t used_ref = 1; + + start = sack->start; + end = sack->end; + rsm = *prsm; + if (rsm && SEQ_LT(start, rsm->r_start)) { + TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { + if (SEQ_GEQ(start, rsm->r_start) && + SEQ_LT(start, rsm->r_end)) { + goto do_rest_ofb; + } + } + } + if (rsm == NULL) { +start_at_beginning: + rsm = NULL; + used_ref = 0; + } + /* First lets locate the block where this guy is */ + TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { + if (SEQ_GEQ(start, rsm->r_start) && + SEQ_LT(start, rsm->r_end)) { + break; + } + } +do_rest_ofb: + if (rsm == NULL) { + /* + * This happens when we get duplicate sack blocks with the + * same end. For example SACK 4: 100 SACK 3: 100 The sort + * will not change there location so we would just start at + * the end of the first one and get lost. + */ + if (tp->t_flags & TF_SENTFIN) { + /* + * Check to see if we have not logged the FIN that + * went out. + */ + nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); + if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { + /* + * Ok we did not get the FIN logged. + */ + nrsm->r_end++; + rsm = nrsm; + goto do_rest_ofb; + } + } + if (times == 1) { +#ifdef INVARIANTS + panic("tp:%p rack:%p sack:%p to:%p prsm:%p", + tp, rack, sack, to, prsm); +#else + goto out; +#endif + } + times++; + counter_u64_add(rack_sack_proc_restart, 1); + goto start_at_beginning; + } + /* Ok we have an ACK for some piece of rsm */ + if (rsm->r_start != start) { + /* + * Need to split this in two pieces the before and after. + */ + nrsm = rack_alloc(rack); + if (nrsm == NULL) { + /* + * failed XXXrrs what can we do but loose the sack + * info? 
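+ * (We just jump to "out" below and this block goes unscored.)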
+ */ + goto out; + } + nrsm->r_start = start; + nrsm->r_rtr_bytes = 0; + nrsm->r_end = rsm->r_end; + nrsm->r_rtr_cnt = rsm->r_rtr_cnt; + nrsm->r_flags = rsm->r_flags; + nrsm->r_sndcnt = rsm->r_sndcnt; + for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { + nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + } + rsm->r_end = nrsm->r_start; + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + rsm->r_flags &= (~RACK_HAS_FIN); + rsm = nrsm; + } + if (SEQ_GEQ(end, rsm->r_end)) { + /* + * The end of this block is either beyond this guy or right + * at this guy. + */ + + if ((rsm->r_flags & RACK_ACKED) == 0) { + rack_update_rtt(tp, rack, rsm, to, cts, SACKED); + changed += (rsm->r_end - rsm->r_start); + rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); + rack_log_sack_passed(tp, rack, rsm); + /* Is Reordering occuring? */ + if (rsm->r_flags & RACK_SACK_PASSED) { + counter_u64_add(rack_reorder_seen, 1); + rack->r_ctl.rc_reorder_ts = cts; + } + rsm->r_flags |= RACK_ACKED; + rsm->r_flags &= ~RACK_TLP; + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 0; + } + } + if (end == rsm->r_end) { + /* This block only - done */ + goto out; + } + /* There is more not coverend by this rsm move on */ + start = rsm->r_end; + nrsm = TAILQ_NEXT(rsm, r_next); + rsm = nrsm; + times = 0; + goto do_rest_ofb; + } + /* Ok we need to split off this one at the tail */ + nrsm = rack_alloc(rack); + if (nrsm == NULL) { + /* failed rrs what can we do but loose the sack info? */ + goto out; + } + /* Clone it */ + nrsm->r_start = end; + nrsm->r_end = rsm->r_end; + nrsm->r_rtr_bytes = 0; + nrsm->r_rtr_cnt = rsm->r_rtr_cnt; + nrsm->r_flags = rsm->r_flags; + nrsm->r_sndcnt = rsm->r_sndcnt; + for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { + nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + } + /* The sack block does not cover this guy fully */ + rsm->r_flags &= (~RACK_HAS_FIN); + rsm->r_end = end; + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + if (rsm->r_flags & RACK_ACKED) { + /* Been here done that */ + goto out; + } + rack_update_rtt(tp, rack, rsm, to, cts, SACKED); + changed += (rsm->r_end - rsm->r_start); + rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); + rack_log_sack_passed(tp, rack, rsm); + /* Is Reordering occuring? 
*/ + if (rsm->r_flags & RACK_SACK_PASSED) { + counter_u64_add(rack_reorder_seen, 1); + rack->r_ctl.rc_reorder_ts = cts; + } + rsm->r_flags |= RACK_ACKED; + rsm->r_flags &= ~RACK_TLP; + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 0; + } +out: + if (used_ref == 0) { + counter_u64_add(rack_sack_proc_all, 1); + } else { + counter_u64_add(rack_sack_proc_short, 1); + } + /* Save off where we last were */ + if (rsm) + rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); + else + rack->r_ctl.rc_sacklast = NULL; + *prsm = rsm; + return (changed); +} + +static void inline +rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) +{ + struct rack_sendmap *tmap; + + tmap = NULL; + while (rsm && (rsm->r_flags & RACK_ACKED)) { + /* Its no longer sacked, mark it so */ + rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); +#ifdef INVARIANTS + if (rsm->r_in_tmap) { + panic("rack:%p rsm:%p flags:0x%x in tmap?", + rack, rsm, rsm->r_flags); + } +#endif + rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); + /* Rebuild it into our tmap */ + if (tmap == NULL) { + TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); + tmap = rsm; + } else { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); + tmap = rsm; + } + tmap->r_in_tmap = 1; + rsm = TAILQ_NEXT(rsm, r_next); + } + /* + * Now lets possibly clear the sack filter so we start + * recognizing sacks that cover this area. + */ + if (rack_use_sack_filter) + sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); + +} + +static void +rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) +{ + uint32_t changed, last_seq, entered_recovery = 0; + struct tcp_rack *rack; + struct rack_sendmap *rsm; + struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; + register uint32_t th_ack; + int32_t i, j, k, num_sack_blks = 0; + uint32_t cts, acked, ack_point, sack_changed = 0; + + INP_WLOCK_ASSERT(tp->t_inpcb); + if (th->th_flags & TH_RST) { + /* We don't log resets */ + return; + } + rack = (struct tcp_rack *)tp->t_fb_ptr; + cts = tcp_ts_getticks(); + rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); + changed = 0; + th_ack = th->th_ack; + + if (SEQ_GT(th_ack, tp->snd_una)) { + rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); + tp->t_acktime = ticks; + } + if (rsm && SEQ_GT(th_ack, rsm->r_start)) + changed = th_ack - rsm->r_start; + if (changed) { + /* + * The ACK point is advancing to th_ack, we must drop off + * the packets in the rack log and calculate any eligble + * RTT's. + */ + rack->r_wanted_output++; +more: + rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); + if (rsm == NULL) { + if ((th_ack - 1) == tp->iss) { + /* + * For the SYN incoming case we will not + * have called tcp_output for the sending of + * the SYN, so there will be no map. All + * other cases should probably be a panic. + */ + goto proc_sack; + } + if (tp->t_flags & TF_SENTFIN) { + /* if we send a FIN we will not hav a map */ + goto proc_sack; + } +#ifdef INVARIANTS + panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", + tp, + th, tp->t_state, rack, + tp->snd_una, tp->snd_max, tp->snd_nxt, changed); +#endif + goto proc_sack; + } + if (SEQ_LT(th_ack, rsm->r_start)) { + /* Huh map is missing this */ +#ifdef INVARIANTS + printf("Rack map starts at r_start:%u for th_ack:%u huh? 
ts:%d rs:%d\n", + rsm->r_start, + th_ack, tp->t_state, rack->r_state); +#endif + goto proc_sack; + } + rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); + /* Now do we consume the whole thing? */ + if (SEQ_GEQ(th_ack, rsm->r_end)) { + /* Its all consumed. */ + uint32_t left; + + rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; + rsm->r_rtr_bytes = 0; + TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 0; + } + if (rack->r_ctl.rc_next == rsm) { + /* scoot along the marker */ + rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map); + } + if (rsm->r_flags & RACK_ACKED) { + /* + * It was acked on the scoreboard -- remove + * it from total + */ + rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); + } else if (rsm->r_flags & RACK_SACK_PASSED) { + /* + * There are acked segments ACKED on the + * scoreboard further up. We are seeing + * reordering. + */ + counter_u64_add(rack_reorder_seen, 1); + rsm->r_flags |= RACK_ACKED; + rack->r_ctl.rc_reorder_ts = cts; + } + left = th_ack - rsm->r_end; + if (rsm->r_rtr_cnt > 1) { + /* + * Technically we should make r_rtr_cnt be + * monotonicly increasing and just mod it to + * the timestamp it is replacing.. that way + * we would have the last 3 retransmits. Now + * rc_loss_count will be wrong if we + * retransmit something more than 2 times in + * recovery :( + */ + rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); + } + /* Free back to zone */ + rack_free(rack, rsm); + if (left) { + goto more; + } + goto proc_sack; + } + if (rsm->r_flags & RACK_ACKED) { + /* + * It was acked on the scoreboard -- remove it from + * total for the part being cum-acked. + */ + rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); + } + rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; + rsm->r_rtr_bytes = 0; + rsm->r_start = th_ack; + } +proc_sack: + /* Check for reneging */ + rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); + if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { + /* + * The peer has moved snd_una up to + * the edge of this send, i.e. one + * that it had previously acked. The only + * way that can be true if the peer threw + * away data (space issues) that it had + * previously sacked (else it would have + * given us snd_una up to (rsm->r_end). + * We need to undo the acked markings here. + * + * Note we have to look to make sure th_ack is + * our rsm->r_start in case we get an old ack + * where th_ack is behind snd_una. + */ + rack_peer_reneges(rack, rsm, th->th_ack); + } + if ((to->to_flags & TOF_SACK) == 0) { + /* We are done nothing left to log */ + goto out; + } + rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); + if (rsm) { + last_seq = rsm->r_end; + } else { + last_seq = tp->snd_max; + } + /* Sack block processing */ + if (SEQ_GT(th_ack, tp->snd_una)) + ack_point = th_ack; + else + ack_point = tp->snd_una; + for (i = 0; i < to->to_nsacks; i++) { + bcopy((to->to_sacks + i * TCPOLEN_SACK), + &sack, sizeof(sack)); + sack.start = ntohl(sack.start); + sack.end = ntohl(sack.end); + if (SEQ_GT(sack.end, sack.start) && + SEQ_GT(sack.start, ack_point) && + SEQ_LT(sack.start, tp->snd_max) && + SEQ_GT(sack.end, ack_point) && + SEQ_LEQ(sack.end, tp->snd_max)) { + if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) && + (SEQ_LT(sack.end, last_seq)) && + ((sack.end - sack.start) < (tp->t_maxseg / 8))) { + /* + * Not the last piece and its smaller than + * 1/8th of a MSS. We ignore this. 
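+ * This only kicks in once rc_num_maps_alloced has grown past
+ * rack_sack_block_limit; dropping these runt blocks (counted in
+ * rack_runt_sacks) bounds how finely the map can be fragmented.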
+ */ + counter_u64_add(rack_runt_sacks, 1); + continue; + } + sack_blocks[num_sack_blks] = sack; + num_sack_blks++; +#ifdef NETFLIX_STATS + } else if (SEQ_LEQ(sack.start, th_ack) && + SEQ_LEQ(sack.end, th_ack)) { + /* + * Its a D-SACK block. + */ + tcp_record_dsack(sack.start, sack.end); +#endif + } + + } + if (num_sack_blks == 0) + goto out; + /* + * Sort the SACK blocks so we can update the rack scoreboard with + * just one pass. + */ + if (rack_use_sack_filter) { + num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); + } + if (num_sack_blks < 2) { + goto do_sack_work; + } + /* Sort the sacks */ + for (i = 0; i < num_sack_blks; i++) { + for (j = i + 1; j < num_sack_blks; j++) { + if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { + sack = sack_blocks[i]; + sack_blocks[i] = sack_blocks[j]; + sack_blocks[j] = sack; + } + } + } + /* + * Now are any of the sack block ends the same (yes some + * implememtations send these)? + */ +again: + if (num_sack_blks > 1) { + for (i = 0; i < num_sack_blks; i++) { + for (j = i + 1; j < num_sack_blks; j++) { + if (sack_blocks[i].end == sack_blocks[j].end) { + /* + * Ok these two have the same end we + * want the smallest end and then + * throw away the larger and start + * again. + */ + if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { + /* + * The second block covers + * more area use that + */ + sack_blocks[i].start = sack_blocks[j].start; + } + /* + * Now collapse out the dup-sack and + * lower the count + */ + for (k = (j + 1); k < num_sack_blks; k++) { + sack_blocks[j].start = sack_blocks[k].start; + sack_blocks[j].end = sack_blocks[k].end; + j++; + } + num_sack_blks--; + goto again; + } + } + } + } +do_sack_work: + rsm = rack->r_ctl.rc_sacklast; + for (i = 0; i < num_sack_blks; i++) { + acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts); + if (acked) { + rack->r_wanted_output++; + changed += acked; + sack_changed += acked; + } + } +out: + if (changed) { + /* Something changed cancel the rack timer */ + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + } + if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { + /* + * Ok we have a high probability that we need to go in to + * recovery since we have data sack'd + */ + struct rack_sendmap *rsm; + uint32_t tsused; + + tsused = tcp_ts_getticks(); + rsm = tcp_rack_output(tp, rack, tsused); + if (rsm) { + /* Enter recovery */ + rack->r_ctl.rc_rsm_start = rsm->r_start; + rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; + rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; + entered_recovery = 1; + rack_cong_signal(tp, NULL, CC_NDUPACK); + /* + * When we enter recovery we need to assure we send + * one packet. 
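+ * Priming rc_prr_sndcnt with a full MSS and setting r_timer_override
+ * below forces that packet out even before any PRR credit has been
+ * earned.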
+ */ + rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_timer_override = 1; + } + } + if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { + /* Deal with changed an PRR here (in recovery only) */ + uint32_t pipe, snd_una; + + rack->r_ctl.rc_prr_delivered += changed; + /* Compute prr_sndcnt */ + if (SEQ_GT(tp->snd_una, th_ack)) { + snd_una = tp->snd_una; + } else { + snd_una = th_ack; + } + pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; + if (pipe > tp->snd_ssthresh) { + long sndcnt; + + sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; + if (rack->r_ctl.rc_prr_recovery_fs > 0) + sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; + else { + rack->r_ctl.rc_prr_sndcnt = 0; + sndcnt = 0; + } + sndcnt++; + if (sndcnt > (long)rack->r_ctl.rc_prr_out) + sndcnt -= rack->r_ctl.rc_prr_out; + else + sndcnt = 0; + rack->r_ctl.rc_prr_sndcnt = sndcnt; + } else { + uint32_t limit; + + if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) + limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); + else + limit = 0; + if (changed > limit) + limit = changed; + limit += tp->t_maxseg; + if (tp->snd_ssthresh > pipe) { + rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); + } else { + rack->r_ctl.rc_prr_sndcnt = min(0, limit); + } + } + if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { + rack->r_timer_override = 1; + } + } +} + +/* + * Return value of 1, we do not need to call rack_process_data(). + * return value of 0, rack_process_data can be called. + * For ret_val if its 0 the TCP is locked, if its non-zero + * its unlocked and probably unsafe to touch the TCB. + */ +static int +rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, + int32_t * ti_locked, uint32_t tiwin, int32_t tlen, + int32_t * ofia, int32_t thflags, int32_t * ret_val) +{ + int32_t ourfinisacked = 0; + int32_t nsegs, acked_amount; + int32_t acked; + struct mbuf *mfree; + struct tcp_rack *rack; + int32_t recovery = 0; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (SEQ_GT(th->th_ack, tp->snd_max)) { + rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); + return (1); + } + if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { + rack_log_ack(tp, to, th); + } + if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { + /* + * Old ack, behind (or duplicate to) the last one rcv'd + * Note: Should mark reordering is occuring! We should also + * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, + * 3-3, 4-4 would be reording. As well as ack 1, 3-3 ack 3 + */ + return (0); + } + /* + * If we reach this point, ACK is not a duplicate, i.e., it ACKs + * something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our SYN has + * been ACK'd (so connection is now fully synchronized). Go + * to non-starred state, increment snd_una for ACK of SYN, + * and check if we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == + (TF_RCVD_SCALE | TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + /* Send window already scaled. 
*/ + } + } + nsegs = max(1, m->m_pkthdr.lro_nsegs); + INP_WLOCK_ASSERT(tp->t_inpcb); + + acked = BYTES_THIS_ACK(tp, th); + TCPSTAT_ADD(tcps_rcvackpack, nsegs); + TCPSTAT_ADD(tcps_rcvackbyte, acked); + + /* + * If we just performed our first retransmit, and the ACK arrives + * within our recovery window, then it was a mistake to do the + * retransmit in the first place. Recover our original cwnd and + * ssthresh, and proceed to transmit where we left off. + */ + if (tp->t_flags & TF_PREVVALID) { + tp->t_flags &= ~TF_PREVVALID; + if (tp->t_rxtshift == 1 && + (int)(ticks - tp->t_badrxtwin) < 0) + rack_cong_signal(tp, th, CC_RTO_ERR); + } + /* + * If we have a timestamp reply, update smoothed round trip time. If + * no timestamp is present but transmit timer is running and timed + * sequence number was acked, update smoothed round trip time. Since + * we now have an rtt measurement, cancel the timer backoff (cf., + * Phil Karn's retransmit alg.). Recompute the initial retransmit + * timer. + * + * Some boxes send broken timestamp replies during the SYN+ACK + * phase, ignore timestamps of 0 or we could calculate a huge RTT + * and blow up the retransmit timer. + */ + /* + * If all outstanding data is acked, stop retransmit timer and + * remember to restart (more output or persist). If there is more + * data to be acked, restart retransmit timer, using current + * (possibly backed-off) value. + */ + if (th->th_ack == tp->snd_max) { + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + rack->r_wanted_output++; + } + /* + * If no data (only SYN) was ACK'd, skip rest of ACK processing. + */ + if (acked == 0) { + if (ofia) + *ofia = ourfinisacked; + return (0); + } + if (rack->r_ctl.rc_early_recovery) { + if (IN_FASTRECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + tcp_rack_partialack(tp, th); + } else { + rack_post_recovery(tp, th); + recovery = 1; + } + } + } + /* + * Let the congestion control algorithm update congestion control + * related information. This typically means increasing the + * congestion window. + */ + rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); + SOCKBUF_LOCK(&so->so_snd); + acked_amount = min(acked, (int)sbavail(&so->so_snd)); + tp->snd_wnd -= acked_amount; + mfree = sbcut_locked(&so->so_snd, acked_amount); + if ((sbused(&so->so_snd) == 0) && + (acked > acked_amount) && + (tp->t_state >= TCPS_FIN_WAIT_1)) { + ourfinisacked = 1; + } + /* NB: sowwakeup_locked() does an implicit unlock. */ + sowwakeup_locked(so); + m_freem(mfree); + if (rack->r_ctl.rc_early_recovery == 0) { + if (IN_FASTRECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + tcp_rack_partialack(tp, th); + } else { + rack_post_recovery(tp, th); + } + } + } + tp->snd_una = th->th_ack; + if (SEQ_GT(tp->snd_una, tp->snd_recover)) + tp->snd_recover = tp->snd_una; + + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { + tp->snd_nxt = tp->snd_una; + } + if (tp->snd_una == tp->snd_max) { + /* Nothing left outstanding */ + rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); + tp->t_acktime = 0; + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + /* Set need output so persist might get set */ + rack->r_wanted_output++; + if (rack_use_sack_filter) + sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); + if ((tp->t_state >= TCPS_FIN_WAIT_1) && + (sbavail(&so->so_snd) == 0) && + (tp->t_flags2 & TF2_DROP_AF_DATA)) { + /* + * The socket was gone and the + * peer sent data, time to + * reset him. 
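+ * (TF2_DROP_AF_DATA with nothing left in the send buffer at or past
+ * FIN_WAIT_1 means the application is gone, so we close the tcb and
+ * answer with a reset.)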
+ */ + *ret_val = 1; + tp = tcp_close(tp); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, tlen); + return (1); + } + } + if (ofia) + *ofia = ourfinisacked; + return (0); +} + + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCP is still + * locked. + */ +static int +rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + /* + * Update window information. Don't look at window if no ACK: TAC's + * send garbage on first SYN. + */ + int32_t nsegs; +#ifdef TCP_RFC7413 + int32_t tfo_syn; +#else +#define tfo_syn (FALSE) +#endif + struct tcp_rack *rack; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + INP_WLOCK_ASSERT(tp->t_inpcb); + + nsegs = max(1, m->m_pkthdr.lro_nsegs); + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + TCPSTAT_INC(tcps_rcvwinupd); + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + rack->r_wanted_output++; + } else if (thflags & TH_ACK) { + if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + } + } + /* Was persist timer active and now we have window space? */ + if ((rack->rc_in_persist != 0) && tp->snd_wnd) { + rack_exit_persist(tp, rack); + tp->snd_nxt = tp->snd_max; + /* Make sure we output to start the timer */ + rack->r_wanted_output++; + } + /* + * Process segments with URG. + */ + if ((thflags & TH_URG) && th->th_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept random + * urgent pointers, we'll crash in soreceive. It's hard to + * imagine someone actually wanting to send this much urgent + * data. + */ + SOCKBUF_LOCK(&so->so_rcv); + if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { + th->th_urp = 0; /* XXX */ + thflags &= ~TH_URG; /* XXX */ + SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, then + * mark the data stream. This should not happen in + * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a + * FIN has been received from the remote side. In these + * states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), the urgent + * pointer points to the last octet of urgent data. We + * continue, however, to consider it to indicate the first + * octet of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { + tp->rcv_up = th->th_seq + th->th_urp; + so->so_oobmark = sbavail(&so->so_rcv) + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_rcv.sb_state |= SBS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + SOCKBUF_UNLOCK(&so->so_rcv); + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. 
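+ * (This urgent-pointer handling is carried over essentially verbatim
+ * from the base stack's tcp_do_segment(); nothing in it is RACK
+ * specific.)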
+ */ + if (th->th_urp <= (uint32_t) tlen && + !(so->so_options & SO_OOBINLINE)) { + /* hdr drop is delayed */ + tcp_pulloutofband(so, th, m, drop_hdrlen); + } + } else { + /* + * If no out of band data is expected, pull receive urgent + * pointer along with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; + } +dodata: /* XXX */ + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * Process the segment text, merging it into the TCP sequencing + * queue, and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data is + * presented to the user (this happens in tcp_usrreq.c, case + * PRU_RCVD). If a FIN has already been received on this connection + * then we just ignore the text. + */ +#ifdef TCP_RFC7413 + tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && + (tp->t_flags & TF_FASTOPEN)); +#endif + if ((tlen || (thflags & TH_FIN) || tfo_syn) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + tcp_seq save_start = th->th_seq; + + m_adj(m, drop_hdrlen); /* delayed header drop */ + /* + * Insert segment which includes th into TCP reassembly + * queue with control block tp. Set thflags to whether + * reassembly now includes a segment with FIN. This handles + * the common case inline (segment is the next to be + * received on an established connection, and the queue is + * empty), avoiding linkage into and removal from the queue + * and repetition of various conversions. Set DELACK for + * segments received in order, but ack immediately when + * segments are out of order (so fast retransmit can work). + */ + if (th->th_seq == tp->rcv_nxt && + LIST_EMPTY(&tp->t_segq) && + (TCPS_HAVEESTABLISHED(tp->t_state) || + tfo_syn)) { + if (DELAY_ACK(tp, tlen) || tfo_syn) { + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + tp->t_flags |= TF_DELACK; + } else { + rack->r_wanted_output++; + tp->t_flags |= TF_ACKNOW; + } + tp->rcv_nxt += tlen; + thflags = th->th_flags & TH_FIN; + TCPSTAT_ADD(tcps_rcvpack, nsegs); + TCPSTAT_ADD(tcps_rcvbyte, tlen); + SOCKBUF_LOCK(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + m_freem(m); + else + sbappendstream_locked(&so->so_rcv, m, 0); + /* NB: sorwakeup_locked() does an implicit unlock. */ + sorwakeup_locked(so); + } else { + /* + * XXX: Due to the header drop above "th" is + * theoretically invalid by now. Fortunately + * m_adj() doesn't actually frees any mbufs when + * trimming from the head. + */ + thflags = tcp_reass(tp, th, &tlen, m); + tp->t_flags |= TF_ACKNOW; + } + if (tlen > 0) + tcp_update_sack_list(tp, save_start, save_start + tlen); + } else { + m_freem(m); + thflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know that the + * connection is closing. + */ + if (thflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized (ie NEEDSYN + * flag on) then delay ACK, so it may be piggybacked + * when SYN is sent. Otherwise, since we received a + * FIN then no more input can be expected, send ACK + * now. + */ + if (tp->t_flags & TF_NEEDSYN) { + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + tp->t_flags |= TF_DELACK; + } else { + tp->t_flags |= TF_ACKNOW; + } + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES enter the + * CLOSE_WAIT state. 
+ */ + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /* FALLTHROUGH */ + case TCPS_ESTABLISHED: + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + tcp_state_change(tp, TCPS_CLOSE_WAIT); + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been + * acked so enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + tcp_state_change(tp, TCPS_CLOSING); + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the + * other standard timers. + */ + case TCPS_FIN_WAIT_2: + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(*ti_locked == TI_RLOCKED, ("%s: dodata " + "TCP_FIN_WAIT_2 ti_locked: %d", __func__, + *ti_locked)); + tcp_twstart(tp); + *ti_locked = TI_UNLOCKED; + INP_INFO_RUNLOCK(&V_tcbinfo); + return (1); + } + } + if (*ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + *ti_locked = TI_UNLOCKED; + } + /* + * Return any desired output. + */ + if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { + rack->r_wanted_output++; + } + KASSERT(*ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", + __func__, *ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + return (0); +} + +/* + * Here nothing is really faster, its just that we + * have broken out the fast-data path also just like + * the fast-ack. + */ +static int +rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt) +{ + int32_t nsegs; + int32_t newsize = 0; /* automatic sockbuf scaling */ + struct tcp_rack *rack; +#ifdef TCPDEBUG + /* + * The size of tcp_saveipgen must be the size of the max ip header, + * now IPv6. + */ + u_char tcp_saveipgen[IP6_HDR_LEN]; + struct tcphdr tcp_savetcp; + short ostate = 0; + +#endif + /* + * If last ACK falls within this segment's sequence numbers, record + * the timestamp. NOTE that the test is modified according to the + * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if (__predict_false(th->th_seq != tp->rcv_nxt)) { + return (0); + } + if (__predict_false(tp->snd_nxt != tp->snd_max)) { + return (0); + } + if (tiwin && tiwin != tp->snd_wnd) { + return (0); + } + if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { + return (0); + } + if (__predict_false((to->to_flags & TOF_TS) && + (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { + return (0); + } + if (__predict_false((th->th_ack != tp->snd_una))) { + return (0); + } + if (__predict_false(tlen > sbspace(&so->so_rcv))) { + return (0); + } + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + rack = (struct tcp_rack *)tp->t_fb_ptr; + /* + * This is a pure, in-sequence data packet with nothing on the + * reassembly queue and we have enough buffer space to take it. + */ + if (*ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + *ti_locked = TI_UNLOCKED; + } + nsegs = max(1, m->m_pkthdr.lro_nsegs); + + + /* Clean receiver SACK report if present */ + if (tp->rcv_numsacks) + tcp_clean_sackreport(tp); + TCPSTAT_INC(tcps_preddat); + tp->rcv_nxt += tlen; + /* + * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 
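+ * (snd_wl1 feeds the SEQ_LT() window-update test in
+ * rack_process_data(); letting it lag far behind th_seq would make
+ * that comparison ambiguous once the sequence space wraps.)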
+ */ + tp->snd_wl1 = th->th_seq; + /* + * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. + */ + tp->rcv_up = tp->rcv_nxt; + TCPSTAT_ADD(tcps_rcvpack, nsegs); + TCPSTAT_ADD(tcps_rcvbyte, tlen); +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, + (void *)tcp_saveipgen, &tcp_savetcp, 0); +#endif + newsize = tcp_autorcvbuf(m, th, so, tp, tlen); + + /* Add data to socket buffer. */ + SOCKBUF_LOCK(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + m_freem(m); + } else { + /* + * Set new socket buffer size. Give up when limit is + * reached. + */ + if (newsize) + if (!sbreserve_locked(&so->so_rcv, + newsize, so, NULL)) + so->so_rcv.sb_flags &= ~SB_AUTOSIZE; + m_adj(m, drop_hdrlen); /* delayed header drop */ + sbappendstream_locked(&so->so_rcv, m, 0); + rack_calc_rwin(so, tp); + } + /* NB: sorwakeup_locked() does an implicit unlock. */ + sorwakeup_locked(so); + if (DELAY_ACK(tp, tlen)) { + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + tp->t_flags |= TF_DELACK; + } else { + tp->t_flags |= TF_ACKNOW; + rack->r_wanted_output++; + } + if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) + sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); + return (1); +} + +/* + * This subfunction is used to try to highly optimize the + * fast path. We again allow window updates that are + * in sequence to remain in the fast-path. We also add + * in the __predict's to attempt to help the compiler. + * Note that if we return a 0, then we can *not* process + * it and the caller should push the packet into the + * slow-path. + */ +static int +rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) +{ + int32_t acked; + int32_t nsegs; + +#ifdef TCPDEBUG + /* + * The size of tcp_saveipgen must be the size of the max ip header, + * now IPv6. + */ + u_char tcp_saveipgen[IP6_HDR_LEN]; + struct tcphdr tcp_savetcp; + short ostate = 0; + +#endif + struct tcp_rack *rack; + + if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { + /* Old ack, behind (or duplicate to) the last one rcv'd */ + return (0); + } + if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { + /* Above what we have sent? */ + return (0); + } + if (__predict_false(tp->snd_nxt != tp->snd_max)) { + /* We are retransmitting */ + return (0); + } + if (__predict_false(tiwin == 0)) { + /* zero window */ + return (0); + } + if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { + /* We need a SYN or a FIN, unlikely.. */ + return (0); + } + if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { + /* Timestamp is behind .. old ack with seq wrap? */ + return (0); + } + if (__predict_false(IN_RECOVERY(tp->t_flags))) { + /* Still recovering */ + return (0); + } + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (rack->r_ctl.rc_sacked) { + /* We have sack holes on our scoreboard */ + return (0); + } + /* Ok if we reach here, we can process a fast-ack */ + nsegs = max(1, m->m_pkthdr.lro_nsegs); + rack_log_ack(tp, to, th); + /* Did the window get updated? 
*/ + if (tiwin != tp->snd_wnd) { + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + } + if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { + rack_exit_persist(tp, rack); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * the timestamp. NOTE that the test is modified according to the + * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + /* + * This is a pure ack for outstanding data. + */ + if (*ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + *ti_locked = TI_UNLOCKED; + } + TCPSTAT_INC(tcps_predack); + + /* + * "bad retransmit" recovery. + */ + if (tp->t_flags & TF_PREVVALID) { + tp->t_flags &= ~TF_PREVVALID; + if (tp->t_rxtshift == 1 && + (int)(ticks - tp->t_badrxtwin) < 0) + rack_cong_signal(tp, th, CC_RTO_ERR); + } + /* + * Recalculate the transmit timer / rtt. + * + * Some boxes send broken timestamp replies during the SYN+ACK + * phase, ignore timestamps of 0 or we could calculate a huge RTT + * and blow up the retransmit timer. + */ + acked = BYTES_THIS_ACK(tp, th); + +#ifdef TCP_HHOOK + /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ + hhook_run_tcp_est_in(tp, th, to); +#endif + + TCPSTAT_ADD(tcps_rcvackpack, nsegs); + TCPSTAT_ADD(tcps_rcvackbyte, acked); + sbdrop(&so->so_snd, acked); + /* + * Let the congestion control algorithm update congestion control + * related information. This typically means increasing the + * congestion window. + */ + rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); + + tp->snd_una = th->th_ack; + /* + * Pull snd_wl2 up to prevent seq wrap relative to th_ack. + */ + tp->snd_wl2 = th->th_ack; + tp->t_dupacks = 0; + m_freem(m); + /* ND6_HINT(tp); *//* Some progress has been made. */ + + /* + * If all outstanding data are acked, stop retransmit timer, + * otherwise restart timer using current (possibly backed-off) + * value. If process is waiting for space, wakeup/selwakeup/signal. + * If data are ready to send, let tcp_output decide between more + * output or persist. + */ +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, + (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (tp->snd_una == tp->snd_max) { + rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); + tp->t_acktime = 0; + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + } + /* Wake up the socket if we have room to write more */ + sowwakeup(so); + if (sbavail(&so->so_snd)) { + rack->r_wanted_output++; + } + return (1); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCP is still + * locked. + */ +static int +rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ret_val = 0; + int32_t todrop; + int32_t ourfinisacked = 0; + + rack_calc_rwin(so, tp); + /* + * If the state is SYN_SENT: if seg contains an ACK, but not for our + * SYN, drop the input. if seg contains a RST, then drop the + * connection. if seg does not contain SYN, then drop it. 
Otherwise + * this is an acceptable SYN segment initialize tp->rcv_nxt and + * tp->irs if seg contains ack then advance tp->snd_una if seg + * contains an ECE and ECN support is enabled, the stream is ECN + * capable. if SYN has been acked change to ESTABLISHED else + * SYN_RCVD state arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || + SEQ_GT(th->th_ack, tp->snd_max))) { + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { + TCP_PROBE5(connect__refused, NULL, tp, + mtod(m, const char *), tp, th); + tp = tcp_drop(tp, ECONNREFUSED); + rack_do_drop(m, tp, ti_locked); + return (1); + } + if (thflags & TH_RST) { + rack_do_drop(m, tp, ti_locked); + return (1); + } + if (!(thflags & TH_SYN)) { + rack_do_drop(m, tp, ti_locked); + return (1); + } + tp->irs = th->th_seq; + tcp_rcvseqinit(tp); + if (thflags & TH_ACK) { + TCPSTAT_INC(tcps_connects); + soisconnected(so); +#ifdef MAC + mac_socketpeer_set_from_mbuf(m, so); +#endif + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == + (TF_RCVD_SCALE | TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + } + tp->rcv_adv += min(tp->rcv_wnd, + TCP_MAXWIN << tp->rcv_scale); + /* + * If there's data, delay ACK; if there's also a FIN ACKNOW + * will be turned on later. + */ + if (DELAY_ACK(tp, tlen) && tlen != 0) { + rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, + ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); + tp->t_flags |= TF_DELACK; + } else { + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; + tp->t_flags |= TF_ACKNOW; + } + + if ((thflags & TH_ECE) && V_tcp_do_ecn) { + tp->t_flags |= TF_ECN_PERMIT; + TCPSTAT_INC(tcps_ecn_shs); + } + /* + * Received in SYN_SENT[*] state. Transitions: + * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tcp_state_change(tp, TCPS_FIN_WAIT_1); + tp->t_flags &= ~TF_NEEDFIN; + thflags &= ~TH_SYN; + } else { + tcp_state_change(tp, TCPS_ESTABLISHED); + TCP_PROBE5(connect__established, NULL, tp, + mtod(m, const char *), tp, th); + cc_conn_init(tp); + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => simultaneous + * open. If segment contains CC option and there is a + * cached CC, apply TAO test. If it succeeds, connection is * + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If + * there was no CC option, clear cached CC value. + */ + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + tcp_state_change(tp, TCPS_SYN_RECEIVED); + } + KASSERT(*ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " + "ti_locked %d", __func__, *ti_locked)); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + /* + * Advance th->th_seq to correspond to first data byte. If data, + * trim to stay within window, dropping FIN if necessary. + */ + th->th_seq++; + if (tlen > tp->rcv_wnd) { + todrop = tlen - tp->rcv_wnd; + m_adj(m, -todrop); + tlen = tp->rcv_wnd; + thflags &= ~TH_FIN; + TCPSTAT_INC(tcps_rcvpackafterwin); + TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); + } + tp->snd_wl1 = th->th_seq - 1; + tp->rcv_up = th->th_seq; + /* + * Client side of transaction: already sent SYN and data. 
If the + * remote host used T/TCP to validate the SYN, our data will be + * ACK'd; if so, enter normal data segment processing in the middle + * of step 5, ack processing. Otherwise, goto step 6. + */ + if (thflags & TH_ACK) { + if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) + return (ret_val); + /* We may have changed to FIN_WAIT_1 above */ + if (tp->t_state == TCPS_FIN_WAIT_1) { + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now + * acknowledged then enter FIN_WAIT_2. + */ + if (ourfinisacked) { + /* + * If we can't receive any more data, then + * closing user can proceed. Starting the + * timer is contrary to the specification, + * but if we don't get a FIN we'll hang + * forever. + * + * XXXjl: we should release the tp also, and + * use a compressed state. + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + soisdisconnected(so); + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle ? + tcp_finwait2_timeout : + TP_MAXIDLE(tp))); + } + tcp_state_change(tp, TCPS_FIN_WAIT_2); + } + } + } + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCP is still + * locked. + */ +static int +rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ret_val = 0; + int32_t ourfinisacked = 0; + + rack_calc_rwin(so, tp); + + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->snd_una) || + SEQ_GT(th->th_ack, tp->snd_max))) { + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return (1); + } +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) { + /* + * When a TFO connection is in SYN_RECEIVED, the only valid + * packets are the initial SYN, a retransmit/copy of the + * initial SYN (possibly with a subset of the original + * data), a valid ACK, a FIN, or a RST. + */ + if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return (1); + } else if (thflags & TH_SYN) { + /* non-initial SYN is ignored */ + struct tcp_rack *rack; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || + (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || + (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { + rack_do_drop(m, NULL, ti_locked); + return (0); + } + } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { + rack_do_drop(m, NULL, ti_locked); + return (0); + } + } +#endif + if (thflags & TH_RST) + return (rack_process_rst(m, th, so, tp, ti_locked)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + return (ret_val); + } + /* + * In the SYN-RECEIVED state, validate that the packet belongs to + * this connection before trimming the data to fit the receive + * window. 
Check the sequence number versus IRS since we know the + * sequence numbers haven't wrapped. This is a partial fix for the + * "LAND" DoS attack. + */ + if (SEQ_LT(th->th_seq, tp->irs)) { + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) { + tp->snd_wnd = tiwin; + cc_conn_init(tp); + } +#endif + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); + } + TCPSTAT_INC(tcps_connects); + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == + (TF_RCVD_SCALE | TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + tp->snd_wnd = tiwin; + } + /* + * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> + * FIN-WAIT-1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tcp_state_change(tp, TCPS_FIN_WAIT_1); + tp->t_flags &= ~TF_NEEDFIN; + } else { + tcp_state_change(tp, TCPS_ESTABLISHED); + TCP_PROBE5(accept__established, NULL, tp, + mtod(m, const char *), tp, th); +#ifdef TCP_RFC7413 + if (tp->t_tfo_pending) { + tcp_fastopen_decrement_counter(tp->t_tfo_pending); + tp->t_tfo_pending = NULL; + + /* + * Account for the ACK of our SYN prior to regular + * ACK processing below. + */ + tp->snd_una++; + } + /* + * TFO connections call cc_conn_init() during SYN + * processing. Calling it again here for such connections + * is not harmless as it would undo the snd_cwnd reduction + * that occurs when a TFO SYN|ACK is retransmitted. + */ + if (!(tp->t_flags & TF_FASTOPEN)) +#endif + cc_conn_init(tp); + } + /* + * If segment contains data or ACK, will call tcp_reass() later; if + * not, do so now to pass queued data to user. 
+ */ + if (tlen == 0 && (thflags & TH_FIN) == 0) + (void)tcp_reass(tp, (struct tcphdr *)0, 0, + (struct mbuf *)0); + tp->snd_wl1 = th->th_seq - 1; + if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + return (ret_val); + } + if (tp->t_state == TCPS_FIN_WAIT_1) { + /* We could have went to FIN_WAIT_1 (or EST) above */ + /* + * In FIN_WAIT_1 STATE in addition to the processing for the + * ESTABLISHED state if our FIN is now acknowledged then + * enter FIN_WAIT_2. + */ + if (ourfinisacked) { + /* + * If we can't receive any more data, then closing + * user can proceed. Starting the timer is contrary + * to the specification, but if we don't get a FIN + * we'll hang forever. + * + * XXXjl: we should release the tp also, and use a + * compressed state. + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + soisdisconnected(so); + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle ? + tcp_finwait2_timeout : + TP_MAXIDLE(tp))); + } + tcp_state_change(tp, TCPS_FIN_WAIT_2); + } + } + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCP is still + * locked. + */ +static int +rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ret_val = 0; + + /* + * Header prediction: check for the two common cases of a + * uni-directional data xfer. If the packet has no control flags, + * is in-sequence, the window didn't change and we're not + * retransmitting, it's a candidate. If the length is zero and the + * ack moved forward, we're the sender side of the xfer. Just free + * the data acked & wake any higher level process that was blocked + * waiting for space. If the length is non-zero and the ack didn't + * move, we're the receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data toc The socket + * buffer and note that we need a delayed ack. Make sure that the + * hidden state-flags are also off. Since we check for + * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. + */ + if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && + __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && + __predict_true(LIST_EMPTY(&tp->t_segq)) && + __predict_true(th->th_seq == tp->rcv_nxt)) { + struct tcp_rack *rack; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (tlen == 0) { + if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, + ti_locked, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { + return (0); + } + } else { + if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, + ti_locked, tiwin, nxt_pkt)) { + return (0); + } + } + } + rack_calc_rwin(so, tp); + + if (thflags & TH_RST) + return (rack_process_rst(m, th, so, tp, ti_locked)); + + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. 
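 */

/*
 * Editor's illustrative sketch, not part of this revision: the PAWS test
 * below is a wrapping 32-bit "older than" comparison of the echoed
 * timestamp against the newest timestamp previously accepted from the
 * peer (the drop itself is delegated to rack_ts_check()).  The helper
 * name is hypothetical; the kernel spells the same idiom TSTMP_LT().
 */
#if 0
static int
sketch_paws_is_old(uint32_t tsval, uint32_t ts_recent)
{
	/* True when tsval precedes ts_recent in modulo-2^32 arithmetic. */
	return (ts_recent != 0 && (int32_t)(tsval - ts_recent) < 0);
}
#endif
/*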
+ */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + return (ret_val); + } + if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); + + } else if (tp->t_flags & TF_ACKNOW) { + rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + return (ret_val); + } else { + rack_do_drop(m, NULL, ti_locked); + return (0); + } + } + /* + * Ack processing. + */ + if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { + return (ret_val); + } + if (sbavail(&so->so_snd)) { + if (rack_progress_timeout_check(tp)) { + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + /* State changes only happen in rack_process_data() */ + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCP is still + * locked. + */ +static int +rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ret_val = 0; + + rack_calc_rwin(so, tp); + if (thflags & TH_RST) + return (rack_process_rst(m, th, so, tp, ti_locked)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. 
+ */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + return (ret_val); + } + if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); + + } else if (tp->t_flags & TF_ACKNOW) { + rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + return (ret_val); + } else { + rack_do_drop(m, NULL, ti_locked); + return (0); + } + } + /* + * Ack processing. + */ + if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { + return (ret_val); + } + if (sbavail(&so->so_snd)) { + if (rack_progress_timeout_check(tp)) { + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); +} + +static int +rack_check_data_after_close(struct mbuf *m, + struct tcpcb *tp, int32_t *ti_locked, int32_t *tlen, struct tcphdr *th, struct socket *so) +{ + struct tcp_rack *rack; + + KASSERT(*ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " + "CLOSE_WAIT && tlen ti_locked %d", __func__, *ti_locked)); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (rack->rc_allow_data_af_clo == 0) { + close_now: + tp = tcp_close(tp); + TCPSTAT_INC(tcps_rcvafterclose); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, (*tlen)); + return (1); + } + if (sbavail(&so->so_snd) == 0) + goto close_now; + /* Ok we allow data that is ignored and a followup reset */ + tp->rcv_nxt = th->th_seq + *tlen; + tp->t_flags2 |= TF2_DROP_AF_DATA; + rack->r_wanted_output = 1; + *tlen = 0; + return (0); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCP is still + * locked. 
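 *
 * Editor's note, illustrative only: the dispatch in rack_hpts_do_segment()
 * relies on this contract, e.g.
 *
 *	retval = (*rack->r_substate)(m, th, so, tp, &to, drop_hdrlen,
 *	    tlen, &ti_locked, tiwin, thflags, nxt_pkt);
 *	if (retval == 0) {
 *		INP_WLOCK_ASSERT(tp->t_inpcb);
 *		... tp may only be touched on this path ...
 *	}
 *
 * so a handler that returns 1 must already have dropped the INP lock
 * (and possibly freed the tcpcb) on every such exit.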
+ */ +static int +rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ret_val = 0; + int32_t ourfinisacked = 0; + + rack_calc_rwin(so, tp); + + if (thflags & TH_RST) + return (rack_process_rst(m, th, so, tp, ti_locked)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + return (ret_val); + } + if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If new data are received on a connection after the user processes + * are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && tlen) { + if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) + return (1); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); + } else if (tp->t_flags & TF_ACKNOW) { + rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + return (ret_val); + } else { + rack_do_drop(m, NULL, ti_locked); + return (0); + } + } + /* + * Ack processing. + */ + if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + return (ret_val); + } + if (ourfinisacked) { + /* + * If we can't receive any more data, then closing user can + * proceed. Starting the timer is contrary to the + * specification, but if we don't get a FIN we'll hang + * forever. + * + * XXXjl: we should release the tp also, and use a + * compressed state. + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + soisdisconnected(so); + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle ? 
+ tcp_finwait2_timeout : + TP_MAXIDLE(tp))); + } + tcp_state_change(tp, TCPS_FIN_WAIT_2); + } + if (sbavail(&so->so_snd)) { + if (rack_progress_timeout_check(tp)) { + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCP is still + * locked. + */ +static int +rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ret_val = 0; + int32_t ourfinisacked = 0; + + rack_calc_rwin(so, tp); + + if (thflags & TH_RST) + return (rack_process_rst(m, th, so, tp, ti_locked)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + return (ret_val); + } + if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If new data are received on a connection after the user processes + * are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && tlen) { + if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) + return (1); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); + } else if (tp->t_flags & TF_ACKNOW) { + rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + return (ret_val); + } else { + rack_do_drop(m, NULL, ti_locked); + return (0); + } + } + /* + * Ack processing. 
+ */ + if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + return (ret_val); + } + if (ourfinisacked) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + tcp_twstart(tp); + INP_INFO_RUNLOCK(&V_tcbinfo); + *ti_locked = TI_UNLOCKED; + m_freem(m); + return (1); + } + if (sbavail(&so->so_snd)) { + if (rack_progress_timeout_check(tp)) { + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCP is still + * locked. + */ +static int +rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ret_val = 0; + int32_t ourfinisacked = 0; + + rack_calc_rwin(so, tp); + + if (thflags & TH_RST) + return (rack_process_rst(m, th, so, tp, ti_locked)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + return (ret_val); + } + if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If new data are received on a connection after the user processes + * are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && tlen) { + if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) + return (1); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. 
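 */

/*
 * Editor's illustrative sketch, not part of this revision: the three-way
 * decision below, which every rack_do_*() state handler repeats, can be
 * read as a small classifier (helper and enum names are hypothetical):
 */
#if 0
enum noack_action { NOACK_PROCESS_DATA, NOACK_ACK_THEN_DROP, NOACK_DROP };

static enum noack_action
sketch_classify_no_ack_segment(int t_flags)
{
	if (t_flags & TF_NEEDSYN)	/* half-synchronized: keep the data */
		return (NOACK_PROCESS_DATA);
	if (t_flags & TF_ACKNOW)	/* we owe the peer an ACK first */
		return (NOACK_ACK_THEN_DROP);
	return (NOACK_DROP);		/* nothing useful in this segment */
}
#endif
/*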
+ */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); + } else if (tp->t_flags & TF_ACKNOW) { + rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + return (ret_val); + } else { + rack_do_drop(m, NULL, ti_locked); + return (0); + } + } + /* + * case TCPS_LAST_ACK: Ack processing. + */ + if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + return (ret_val); + } + if (ourfinisacked) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + tp = tcp_close(tp); + rack_do_drop(m, tp, ti_locked); + return (1); + } + if (sbavail(&so->so_snd)) { + if (rack_progress_timeout_check(tp)) { + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); +} + + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCP is still + * locked. + */ +static int +rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ret_val = 0; + int32_t ourfinisacked = 0; + + rack_calc_rwin(so, tp); + + /* Reset receive buffer auto scaling when not in bulk receive mode. */ + if (thflags & TH_RST) + return (rack_process_rst(m, th, so, tp, ti_locked)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + return (ret_val); + } + if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If new data are received on a connection after the user processes + * are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tlen) { + if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) + return (1); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. 
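 */

/*
 * Editor's illustrative sketch, not part of this revision: the condition
 * below updates ts_recent only when the last ACK we sent falls inside
 * [SEG.SEQ, SEG.SEQ + SEG.LEN], where a SYN or FIN flag extends the span
 * by one sequence number.  Written as a standalone predicate with the
 * wrapping SEQ_LEQ() comparisons spelled out (helper name hypothetical):
 */
#if 0
static int
sketch_should_update_ts_recent(uint32_t last_ack_sent, uint32_t seg_seq,
    uint32_t seg_len, int has_syn_or_fin)
{
	uint32_t seg_end = seg_seq + seg_len + (has_syn_or_fin ? 1 : 0);

	return ((int32_t)(seg_seq - last_ack_sent) <= 0 &&
	    (int32_t)(last_ack_sent - seg_end) <= 0);
}
#endif
/*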
+ */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); + } else if (tp->t_flags & TF_ACKNOW) { + rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + return (ret_val); + } else { + rack_do_drop(m, NULL, ti_locked); + return (0); + } + } + /* + * Ack processing. + */ + if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + return (ret_val); + } + if (sbavail(&so->so_snd)) { + if (rack_progress_timeout_check(tp)) { + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, + ti_locked, tiwin, thflags, nxt_pkt)); +} + + +static void inline +rack_clear_rate_sample(struct tcp_rack *rack) +{ + rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; + rack->r_ctl.rack_rs.rs_rtt_cnt = 0; + rack->r_ctl.rack_rs.rs_rtt_tot = 0; +} + +static int +rack_init(struct tcpcb *tp) +{ + struct tcp_rack *rack = NULL; + + tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); + if (tp->t_fb_ptr == NULL) { + /* + * We need to allocate memory but cant. The INP and INP_INFO + * locks and they are recusive (happens during setup. So a + * scheme to drop the locks fails :( + * + */ + return (ENOMEM); + } + memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); + + rack = (struct tcp_rack *)tp->t_fb_ptr; + TAILQ_INIT(&rack->r_ctl.rc_map); + TAILQ_INIT(&rack->r_ctl.rc_free); + TAILQ_INIT(&rack->r_ctl.rc_tmap); + rack->rc_tp = tp; + if (tp->t_inpcb) { + rack->rc_inp = tp->t_inpcb; + } + /* Probably not needed but lets be sure */ + rack_clear_rate_sample(rack); + rack->r_cpu = 0; + rack->r_ctl.rc_reorder_fade = rack_reorder_fade; + rack->rc_allow_data_af_clo = rack_ignore_data_after_close; + rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; + rack->rc_pace_reduce = rack_slot_reduction; + if (V_tcp_delack_enabled) + tp->t_delayed_ack = 1; + else + tp->t_delayed_ack = 0; + rack->rc_pace_max_segs = rack_hptsi_segments; + rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg; + rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; + rack->r_ctl.rc_pkt_delay = rack_pkt_delay; + rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; + rack->r_idle_reduce_largest = rack_reduce_largest_on_idle; + rack->r_enforce_min_pace = rack_min_pace_time; + rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req; + rack->r_ctl.rc_prop_rate = rack_proportional_rate; + rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; + rack->r_ctl.rc_early_recovery = rack_early_recovery; + rack->rc_always_pace = rack_pace_every_seg; + rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; + rack->rack_tlp_threshold_use = rack_tlp_threshold_use; + rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; + rack->r_ctl.rc_min_to = rack_min_to; + rack->r_ctl.rc_prr_inc_var = rack_inc_var; + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); + if (tp->snd_una != tp->snd_max) { + /* 
Create a send map for the current outstanding data */ + struct rack_sendmap *rsm; + + rsm = rack_alloc(rack); + if (rsm == NULL) { + uma_zfree(rack_pcb_zone, tp->t_fb_ptr); + tp->t_fb_ptr = NULL; + return (ENOMEM); + } + rsm->r_flags = RACK_OVERMAX; + rsm->r_tim_lastsent[0] = tcp_ts_getticks(); + rsm->r_rtr_cnt = 1; + rsm->r_rtr_bytes = 0; + rsm->r_start = tp->snd_una; + rsm->r_end = tp->snd_max; + rsm->r_sndcnt = 0; + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 1; + } + return (0); +} + +static int +rack_handoff_ok(struct tcpcb *tp) +{ + if ((tp->t_state == TCPS_CLOSED) || + (tp->t_state == TCPS_LISTEN)) { + /* Sure no problem though it may not stick */ + return (0); + } + if ((tp->t_state == TCPS_SYN_SENT) || + (tp->t_state == TCPS_SYN_RECEIVED)) { + /* + * We really don't know you have to get to ESTAB or beyond + * to tell. + */ + return (EAGAIN); + } + if (tp->t_flags & TF_SACK_PERMIT) { + return (0); + } + /* + * If we reach here we don't do SACK on this connection so we can + * never do rack. + */ + return (EINVAL); +} + +static void +rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) +{ + if (tp->t_fb_ptr) { + struct tcp_rack *rack; + struct rack_sendmap *rsm; + + rack = (struct tcp_rack *)tp->t_fb_ptr; +#ifdef TCP_BLACKBOX + tcp_log_flowend(tp); +#endif + rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); + while (rsm) { + TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); + uma_zfree(rack_zone, rsm); + rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); + } + rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); + while (rsm) { + TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); + uma_zfree(rack_zone, rsm); + rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); + } + rack->rc_free_cnt = 0; + uma_zfree(rack_pcb_zone, tp->t_fb_ptr); + tp->t_fb_ptr = NULL; + } +} + +static void +rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) +{ + switch (tp->t_state) { + case TCPS_SYN_SENT: + rack->r_state = TCPS_SYN_SENT; + rack->r_substate = rack_do_syn_sent; + break; + case TCPS_SYN_RECEIVED: + rack->r_state = TCPS_SYN_RECEIVED; + rack->r_substate = rack_do_syn_recv; + break; + case TCPS_ESTABLISHED: + rack->r_state = TCPS_ESTABLISHED; + rack->r_substate = rack_do_established; + break; + case TCPS_CLOSE_WAIT: + rack->r_state = TCPS_CLOSE_WAIT; + rack->r_substate = rack_do_close_wait; + break; + case TCPS_FIN_WAIT_1: + rack->r_state = TCPS_FIN_WAIT_1; + rack->r_substate = rack_do_fin_wait_1; + break; + case TCPS_CLOSING: + rack->r_state = TCPS_CLOSING; + rack->r_substate = rack_do_closing; + break; + case TCPS_LAST_ACK: + rack->r_state = TCPS_LAST_ACK; + rack->r_substate = rack_do_lastack; + break; + case TCPS_FIN_WAIT_2: + rack->r_state = TCPS_FIN_WAIT_2; + rack->r_substate = rack_do_fin_wait_2; + break; + case TCPS_LISTEN: + case TCPS_CLOSED: + case TCPS_TIME_WAIT: + default: +#ifdef INVARIANTS + panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); +#endif + break; + }; +} + + +static void +rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) +{ + /* + * We received an ack, and then did not + * call send or were bounced out due to the + * hpts was running. Now a timer is up as well, is + * it the right timer? 
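 *
 * Editor's note, illustrative only: the audit below boils down to
 * checking that the armed pacer timer matches the connection's
 * situation, roughly
 *
 *	in persist                          -> PACE_TMR_PERSIT
 *	nothing outstanding, delayed ACK    -> PACE_TMR_DELACK
 *	nothing outstanding, keepalive on   -> PACE_TMR_KEEP
 *	data outstanding                    -> PACE_TMR_RACK/TLP/RXT
 *
 * and any mismatch cancels the current timer and starts a fresh one.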
+ */ + struct rack_sendmap *rsm; + int tmr_up; + + tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; + if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) + return; + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && + (tmr_up == PACE_TMR_RXT)) { + /* Should be an RXT */ + return; + } + if (rsm == NULL) { + /* Nothing outstanding? */ + if (tp->t_flags & TF_DELACK) { + if (tmr_up == PACE_TMR_DELACK) + /* We are supposed to have delayed ack up and we do */ + return; + } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { + /* + * if we hit enobufs then we would expect the possiblity + * of nothing outstanding and the RXT up (and the hptsi timer). + */ + return; + } else if (((tcp_always_keepalive || + rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && + (tp->t_state <= TCPS_CLOSING)) && + (tmr_up == PACE_TMR_KEEP) && + (tp->snd_max == tp->snd_una)) { + /* We should have keep alive up and we do */ + return; + } + } + if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { + if ((tp->t_flags & TF_SENTFIN) && + ((tp->snd_max - tp->snd_una) == 1) && + (rsm->r_flags & RACK_HAS_FIN)) { + /* needs to be a RXT */ + if (tmr_up == PACE_TMR_RXT) + return; + } else if (tmr_up == PACE_TMR_RACK) + return; + } else if (SEQ_GT(tp->snd_max,tp->snd_una) && + ((tmr_up == PACE_TMR_TLP) || + (tmr_up == PACE_TMR_RXT))) { + /* + * Either a TLP or RXT is fine if no sack-passed + * is in place and data is outstanding. + */ + return; + } else if (tmr_up == PACE_TMR_DELACK) { + /* + * If the delayed ack was going to go off + * before the rtx/tlp/rack timer were going to + * expire, then that would be the timer in control. + * Note we don't check the time here trusting the + * code is correct. + */ + return; + } + /* + * Ok the timer originally started is not what we want now. + * We will force the hpts to be stopped if any, and restart + * with the slot set to what was in the saved slot. + */ + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); +} + +static void +rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, + int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv) +{ + int32_t thflags, retval, did_out = 0; + int32_t way_out = 0; + uint32_t cts; + uint32_t tiwin; + struct tcpopt to; + struct tcp_rack *rack; + struct rack_sendmap *rsm; + int32_t prev_state = 0; + + cts = tcp_tv_to_mssectick(tv); + rack = (struct tcp_rack *)tp->t_fb_ptr; + + kern_prefetch(rack, &prev_state); + prev_state = 0; + thflags = th->th_flags; + /* + * If this is either a state-changing packet or current state isn't + * established, we require a read lock on tcbinfo. Otherwise, we + * allow the tcbinfo to be in either locked or unlocked, as the + * caller may have unnecessarily acquired a lock due to a race. 
+ */ + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || + tp->t_state != TCPS_ESTABLISHED) { + KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " + "SYN/FIN/RST/!EST", __func__, ti_locked)); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + } else { +#ifdef INVARIANTS + if (ti_locked == TI_RLOCKED) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + } else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif + } + INP_WLOCK_ASSERT(tp->t_inpcb); + KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", + __func__)); + KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", + __func__)); + { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, + tlen, &log, true); + } + /* + * Segment received on connection. Reset idle time and keep-alive + * timer. XXX: This should be done after segment validation to + * ignore broken/spoofed segs. + */ + if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { +#ifdef NETFLIX_CWV + if ((tp->cwv_enabled) && + ((tp->cwv_cwnd_valid == 0) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { + tcp_newcwv_nvp_closedown(tp); + } else +#endif + if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { + counter_u64_add(rack_input_idle_reduces, 1); + rack_cc_after_idle(tp, + (rack->r_idle_reduce_largest ? 1 :0)); + } + } + rack->r_ctl.rc_rcvtime = cts; + tp->t_rcvtime = ticks; + +#ifdef NETFLIX_CWV + if (tp->cwv_enabled) { + if ((tp->cwv_cwnd_valid == 0) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) + tcp_newcwv_nvp_closedown(tp); + } +#endif + /* + * Unscale the window into a 32-bit value. For the SYN_SENT state + * the scale is zero. + */ + tiwin = th->th_win << tp->snd_scale; +#ifdef NETFLIX_STATS + stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); +#endif + /* + * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move + * this to occur after we've validated the segment. + */ + if (tp->t_flags & TF_ECN_PERMIT) { + if (thflags & TH_CWR) + tp->t_flags &= ~TF_ECN_SND_ECE; + switch (iptos & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + tp->t_flags |= TF_ECN_SND_ECE; + TCPSTAT_INC(tcps_ecn_ce); + break; + case IPTOS_ECN_ECT0: + TCPSTAT_INC(tcps_ecn_ect0); + break; + case IPTOS_ECN_ECT1: + TCPSTAT_INC(tcps_ecn_ect1); + break; + } + /* Congestion experienced. */ + if (thflags & TH_ECE) { + rack_cong_signal(tp, th, CC_ECN); + } + } + /* + * Parse options on any incoming segment. + */ + tcp_dooptions(&to, (u_char *)(th + 1), + (th->th_off << 2) - sizeof(struct tcphdr), + (thflags & TH_SYN) ? TO_SYN : 0); + + /* + * If echoed timestamp is later than the current time, fall back to + * non RFC1323 RTT calculation. Normalize timestamp if syncookies + * were used when this connection was established. + */ + if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { + to.to_tsecr -= tp->ts_offset; + if (TSTMP_GT(to.to_tsecr, cts)) + to.to_tsecr = 0; + } + /* + * If its the first time in we need to take care of options and + * verify we can do SACK for rack! + */ + if (rack->r_state == 0) { + /* Should be init'd by rack_init() */ + KASSERT(rack->rc_inp != NULL, + ("%s: rack->rc_inp unexpectedly NULL", __func__)); + if (rack->rc_inp == NULL) { + rack->rc_inp = tp->t_inpcb; + } + + /* + * Process options only when we get SYN/ACK back. 
The SYN + * case for incoming connections is handled in tcp_syncache. + * According to RFC1323 the window field in a SYN (i.e., a + * or ) segment itself is never scaled. XXX + * this is traditional behavior, may need to be cleaned up. + */ + rack->r_cpu = inp_to_cpuid(tp->t_inpcb); + if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { + if ((to.to_flags & TOF_SCALE) && + (tp->t_flags & TF_REQ_SCALE)) { + tp->t_flags |= TF_RCVD_SCALE; + tp->snd_scale = to.to_wscale; + } + /* + * Initial send window. It will be updated with the + * next incoming segment to the scaled value. + */ + tp->snd_wnd = th->th_win; + if (to.to_flags & TOF_TS) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to.to_tsval; + tp->ts_recent_age = cts; + } + if (to.to_flags & TOF_MSS) + tcp_mss(tp, to.to_mss); + if ((tp->t_flags & TF_SACK_PERMIT) && + (to.to_flags & TOF_SACKPERM) == 0) + tp->t_flags &= ~TF_SACK_PERMIT; + } + /* + * At this point we are at the initial call. Here we decide + * if we are doing RACK or not. We do this by seeing if + * TF_SACK_PERMIT is set, if not rack is *not* possible and + * we switch to the default code. + */ + if ((tp->t_flags & TF_SACK_PERMIT) == 0) { + tcp_switch_back_to_default(tp); + (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, + tlen, iptos, ti_locked); + return; + } + /* Set the flag */ + rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; + tcp_set_hpts(tp->t_inpcb); + rack_stop_all_timers(tp); + sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); + } + /* + * This is the one exception case where we set the rack state + * always. All other times (timers etc) we must have a rack-state + * set (so we assure we have done the checks above for SACK). + */ + if (rack->r_state != tp->t_state) + rack_set_state(tp, rack); + if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) + kern_prefetch(rsm, &prev_state); + prev_state = rack->r_state; + rack->r_ctl.rc_tlp_send_cnt = 0; + rack_clear_rate_sample(rack); + retval = (*rack->r_substate) (m, th, so, + tp, &to, drop_hdrlen, + tlen, &ti_locked, tiwin, thflags, nxt_pkt); +#ifdef INVARIANTS + if ((retval == 0) && + (tp->t_inpcb == NULL)) { + panic("retval:%d tp:%p t_inpcb:NULL state:%d", + retval, tp, prev_state); + } +#endif + if (ti_locked != TI_UNLOCKED) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + ti_locked = TI_UNLOCKED; + } + if (retval == 0) { + /* + * If retval is 1 the tcb is unlocked and most likely the tp + * is gone. + */ + INP_WLOCK_ASSERT(tp->t_inpcb); + tcp_rack_xmit_timer_commit(rack, tp); + if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && + (rack->rc_in_persist == 0)){ + /* + * The peer shrunk its window on us to the point + * where we have sent too much. The only thing + * we can do here is stop any timers and + * enter persist. We most likely lost the last + * bytes we sent but oh well, we will have to + * retransmit them after the peer is caught up. 
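 *
 * Editor's note with a hypothetical example: with snd_una = 1000,
 * snd_max = 6000 and the peer suddenly advertising a 2000-byte window,
 * 5000 bytes are outstanding against a 2000-byte window, so the test
 * above fires and the code below drops out of hpts, cancels the running
 * timers and lets the persist timer carry the connection until the
 * window reopens.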
+ */ + if (rack->rc_inp->inp_in_hpts) + tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); + rack_timer_cancel(tp, rack, cts, __LINE__); + rack_enter_persist(tp, rack, cts); + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); + way_out = 3; + goto done_with_input; + } + if (nxt_pkt == 0) { + if (rack->r_wanted_output != 0) { + did_out = 1; + (void)tp->t_fb->tfb_tcp_output(tp); + } + rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); + } + if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && + (SEQ_GT(tp->snd_max, tp->snd_una) || + (tp->t_flags & TF_DELACK) || + ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && + (tp->t_state <= TCPS_CLOSING)))) { + /* We could not send (probably in the hpts but stopped the timer earlier)? */ + if ((tp->snd_max == tp->snd_una) && + ((tp->t_flags & TF_DELACK) == 0) && + (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { + /* keep alive not needed if we are hptsi output yet */ + ; + } else { + if (rack->rc_inp->inp_in_hpts) + tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); + } + way_out = 1; + } else { + /* Do we have the correct timer running? */ + rack_timer_audit(tp, rack, &so->so_snd); + way_out = 2; + } + done_with_input: + rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); + if (did_out) + rack->r_wanted_output = 0; +#ifdef INVARIANTS + if (tp->t_inpcb == NULL) { + panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", + did_out, + retval, tp, prev_state); + } +#endif + INP_WUNLOCK(tp->t_inpcb); + } +} + +void +rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, + int32_t ti_locked) +{ + struct timeval tv; +#ifdef RSS + struct tcp_function_block *tfb; + struct tcp_rack *rack; + struct inpcb *inp; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (rack->r_state == 0) { + /* + * Initial input (ACK to SYN-ACK etc)lets go ahead and get + * it processed + */ + if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo)) + ti_locked = TI_RLOCKED; + if (ti_locked != TI_RLOCKED) { + inp = tp->t_inpcb; + tfb = tp->t_fb; + in_pcbref(inp); + INP_WUNLOCK(inp); + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + inp = NULL; + if (inp == NULL || (inp->inp_flags2 & INP_FREED) || + (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { + /* The TCPCB went away. Free the packet. */ + INP_INFO_RUNLOCK(&V_tcbinfo); + if (inp) + INP_WUNLOCK(inp); + m_freem(m); + return; + } + /* If the stack changed, call the correct stack. 
*/ + if (tp->t_fb != tfb) { + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, + drop_hdrlen, tlen, iptos, ti_locked); + return; + } + } + tcp_get_usecs(&tv); + rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, + tlen, iptos, ti_locked, 0, &tv); + return; + } + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); + tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos, (uint8_t) ti_locked); + INP_WUNLOCK(tp->t_inpcb); +#else + tcp_get_usecs(&tv); + rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, + tlen, iptos, ti_locked, 0, &tv); +#endif +} + +struct rack_sendmap * +tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) +{ + struct rack_sendmap *rsm = NULL; + int32_t idx; + uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; + + /* Return the next guy to be re-transmitted */ + if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { + return (NULL); + } + if (tp->t_flags & TF_SENTFIN) { + /* retran the end FIN? */ + return (NULL); + } + /* ok lets look at this one */ + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { + goto check_it; + } + rsm = rack_find_lowest_rsm(rack); + if (rsm == NULL) { + return (NULL); + } +check_it: + srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; + srtt = TICKS_2_MSEC(srtt_cur); + if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) + srtt = rack->rc_rack_rtt; + if (rsm->r_flags & RACK_ACKED) { + return (NULL); + } + if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { + /* Its not yet ready */ + return (NULL); + } + idx = rsm->r_rtr_cnt - 1; + ts_low = rsm->r_tim_lastsent[idx]; + thresh = rack_calc_thresh_rack(rack, srtt, tsused); + if (tsused <= ts_low) { + return (NULL); + } + if ((tsused - ts_low) >= thresh) { + return (rsm); + } + return (NULL); +} + +static int +rack_output(struct tcpcb *tp) +{ + struct socket *so; + uint32_t recwin, sendwin; + uint32_t sb_offset; + int32_t len, flags, error = 0; + struct mbuf *m; + struct mbuf *mb; + uint32_t if_hw_tsomaxsegcount = 0; + uint32_t if_hw_tsomaxsegsize; + long tot_len_this_send = 0; + struct ip *ip = NULL; +#ifdef TCPDEBUG + struct ipovly *ipov = NULL; +#endif + struct udphdr *udp = NULL; + struct tcp_rack *rack; + struct tcphdr *th; + uint8_t pass = 0; + u_char opt[TCP_MAXOLEN]; + unsigned ipoptlen, optlen, hdrlen, ulen=0; + uint32_t rack_seq; + +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + unsigned ipsec_optlen = 0; + +#endif + int32_t idle, sendalot; + int32_t sub_from_prr = 0; + volatile int32_t sack_rxmit; + struct rack_sendmap *rsm = NULL; + int32_t tso, mtu, would_have_fin = 0; + struct tcpopt to; + int32_t slot = 0; + uint32_t cts; + uint8_t hpts_calling, doing_tlp = 0; + int32_t do_a_prefetch; + int32_t prefetch_rsm = 0; + int32_t prefetch_so_done = 0; + struct tcp_log_buffer *lgb = NULL; + struct inpcb *inp; + struct sockbuf *sb; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int32_t isipv6; +#endif + /* setup and take the cache hits here */ + rack = (struct tcp_rack *)tp->t_fb_ptr; + inp = rack->rc_inp; + so = inp->inp_socket; + sb = &so->so_snd; + kern_prefetch(sb, &do_a_prefetch); + do_a_prefetch = 1; + + INP_WLOCK_ASSERT(inp); +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + return (tcp_offload_output(tp)); +#endif + +#ifdef TCP_RFC7413 + /* + * For TFO connections in SYN_RECEIVED, only allow the initial + * SYN|ACK and those sent by the retransmit timer. 
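 *
 * Editor's note, illustrative only: the test below suppresses output
 * (returns 0) when the SYN|ACK has already been transmitted once
 * (snd_max is ahead of snd_una) and snd_nxt has not been pulled back to
 * snd_una, i.e. this is not a retransmit-timer resend; only those two
 * kinds of sends get through in this state.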
+ */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED) && + SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ + (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ + return (0); +#endif +#ifdef INET6 + if (rack->r_state) { + /* Use the cache line loaded if possible */ + isipv6 = rack->r_is_v6; + } else { + isipv6 = (inp->inp_vflag & INP_IPV6) != 0; + } +#endif + cts = tcp_ts_getticks(); + if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && + inp->inp_in_hpts) { + /* + * We are on the hpts for some timer but not hptsi output. + * Remove from the hpts unconditionally. + */ + rack_timer_cancel(tp, rack, cts, __LINE__); + } + /* Mark that we have called rack_output(). */ + if ((rack->r_timer_override) || + (tp->t_flags & TF_FORCEDATA) || + (tp->t_state < TCPS_ESTABLISHED)) { + if (tp->t_inpcb->inp_in_hpts) + tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); + } else if (tp->t_inpcb->inp_in_hpts) { + /* + * On the hpts you can't pass even if ACKNOW is on, we will + * when the hpts fires. + */ + counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); + return (0); + } + hpts_calling = inp->inp_hpts_calls; + inp->inp_hpts_calls = 0; + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { + if (rack_process_timers(tp, rack, cts, hpts_calling)) { + counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); + return (0); + } + } + rack->r_wanted_output = 0; + rack->r_timer_override = 0; + /* + * Determine length of data that should be transmitted, and flags + * that will be used. If there is some data or critical controls + * (SYN, RST) to send, then transmit; otherwise, investigate + * further. + */ + idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); +#ifdef NETFLIX_CWV + if (tp->cwv_enabled) { + if ((tp->cwv_cwnd_valid == 0) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) + tcp_newcwv_nvp_closedown(tp); + } else +#endif + if (tp->t_idle_reduce) { + if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) + rack_cc_after_idle(tp, + (rack->r_idle_reduce_largest ? 1 :0)); + } + tp->t_flags &= ~TF_LASTIDLE; + if (idle) { + if (tp->t_flags & TF_MORETOCOME) { + tp->t_flags |= TF_LASTIDLE; + idle = 0; + } + } +again: + /* + * If we've recently taken a timeout, snd_max will be greater than + * snd_nxt. There may be SACK information that allows us to avoid + * resending already delivered data. Adjust snd_nxt accordingly. + */ + sendalot = 0; + cts = tcp_ts_getticks(); + tso = 0; + mtu = 0; + sb_offset = tp->snd_max - tp->snd_una; + sendwin = min(tp->snd_wnd, tp->snd_cwnd); + + flags = tcp_outflags[tp->t_state]; + /* + * Send any SACK-generated retransmissions. If we're explicitly + * trying to send out new data (when sendalot is 1), bypass this + * function. If we retransmit in fast recovery mode, decrement + * snd_cwnd, since we're replacing a (future) new transmission with + * a retransmission now, and we previously incremented snd_cwnd in + * tcp_input(). + */ + /* + * Still in sack recovery , reset rxmit flag to zero. 
+ */ + while (rack->rc_free_cnt < rack_free_cache) { + rsm = rack_alloc(rack); + if (rsm == NULL) { + if (inp->inp_hpts_calls) + /* Retry in a ms */ + slot = 1; + goto just_return_nolock; + } + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); + rack->rc_free_cnt++; + rsm = NULL; + } + if (inp->inp_hpts_calls) + inp->inp_hpts_calls = 0; + sack_rxmit = 0; + len = 0; + rsm = NULL; + if (flags & TH_RST) { + SOCKBUF_LOCK(sb); + goto send; + } + if (rack->r_ctl.rc_tlpsend) { + /* Tail loss probe */ + long cwin; + long tlen; + + doing_tlp = 1; + rsm = rack->r_ctl.rc_tlpsend; + rack->r_ctl.rc_tlpsend = NULL; + sack_rxmit = 1; + tlen = rsm->r_end - rsm->r_start; + if (tlen > tp->t_maxseg) + tlen = tp->t_maxseg; +#ifdef INVARIANTS + if (SEQ_GT(tp->snd_una, rsm->r_start)) { + panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", + tp, rack, tp->snd_una, rsm, rsm->r_start); + } +#endif + sb_offset = rsm->r_start - tp->snd_una; + cwin = min(tp->snd_wnd, tlen); + len = cwin; + } else if (rack->r_ctl.rc_resend) { + /* Retransmit timer */ + rsm = rack->r_ctl.rc_resend; + rack->r_ctl.rc_resend = NULL; + len = rsm->r_end - rsm->r_start; + sack_rxmit = 1; + sendalot = 0; + sb_offset = rsm->r_start - tp->snd_una; + if (len >= tp->t_maxseg) { + len = tp->t_maxseg; + } + KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", + __func__, sb_offset)); + } else if ((rack->rc_in_persist == 0) && + ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { + long tlen; + + if ((!IN_RECOVERY(tp->t_flags)) && + ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { + /* Enter recovery if not induced by a time-out */ + rack->r_ctl.rc_rsm_start = rsm->r_start; + rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; + rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; + rack_cong_signal(tp, NULL, CC_NDUPACK); + /* + * When we enter recovery we need to assure we send + * one packet. + */ + rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + } +#ifdef INVARIANTS + if (SEQ_LT(rsm->r_start, tp->snd_una)) { + panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", + tp, rack, rsm, rsm->r_start, tp->snd_una); + } +#endif + tlen = rsm->r_end - rsm->r_start; + sb_offset = rsm->r_start - tp->snd_una; + if (tlen > rack->r_ctl.rc_prr_sndcnt) { + len = rack->r_ctl.rc_prr_sndcnt; + } else { + len = tlen; + } + if (len >= tp->t_maxseg) { + sendalot = 1; + len = tp->t_maxseg; + } else { + sendalot = 0; + if ((rack->rc_timer_up == 0) && + (len < tlen)) { + /* + * If its not a timer don't send a partial + * segment. + */ + len = 0; + goto just_return_nolock; + } + } + KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", + __func__, sb_offset)); + if (len > 0) { + sub_from_prr = 1; + sack_rxmit = 1; + TCPSTAT_INC(tcps_sack_rexmits); + TCPSTAT_ADD(tcps_sack_rexmit_bytes, + min(len, tp->t_maxseg)); + counter_u64_add(rack_rtm_prr_retran, 1); + } + } + if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { + /* we are retransmitting the fin */ + len--; + if (len) { + /* + * When retransmitting data do *not* include the + * FIN. This could happen from a TLP probe. + */ + flags &= ~TH_FIN; + } + } +#ifdef INVARIANTS + /* For debugging */ + rack->r_ctl.rc_rsm_at_retran = rsm; +#endif + /* + * Get standard flags, and add SYN or FIN if requested by 'hidden' + * state flags. 
+ */ + if (tp->t_flags & TF_NEEDFIN) + flags |= TH_FIN; + if (tp->t_flags & TF_NEEDSYN) + flags |= TH_SYN; + if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { + void *end_rsm; + end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); + if (end_rsm) + kern_prefetch(end_rsm, &prefetch_rsm); + prefetch_rsm = 1; + } + SOCKBUF_LOCK(sb); + /* + * If in persist timeout with window of 0, send 1 byte. Otherwise, + * if window is small but nonzero and time TF_SENTFIN expired, we + * will send what we can and go to transmit state. + */ + if (tp->t_flags & TF_FORCEDATA) { + if (sendwin == 0) { + /* + * If we still have some data to send, then clear + * the FIN bit. Usually this would happen below + * when it realizes that we aren't sending all the + * data. However, if we have exactly 1 byte of + * unsent data, then it won't clear the FIN bit + * below, and if we are in persist state, we wind up + * sending the packet without recording that we sent + * the FIN bit. + * + * We can't just blindly clear the FIN bit, because + * if we don't have any more data to send then the + * probe will be the FIN itself. + */ + if (sb_offset < sbused(sb)) + flags &= ~TH_FIN; + sendwin = 1; + } else { + if (rack->rc_in_persist) + rack_exit_persist(tp, rack); + /* + * If we are dropping persist mode then we need to + * correct snd_nxt/snd_max and off. + */ + tp->snd_nxt = tp->snd_max; + sb_offset = tp->snd_nxt - tp->snd_una; + } + } + /* + * If snd_nxt == snd_max and we have transmitted a FIN, the + * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a + * negative length. This can also occur when TCP opens up its + * congestion window while receiving additional duplicate acks after + * fast-retransmit because TCP will reset snd_nxt to snd_max after + * the fast-retransmit. + * + * In the normal retransmit-FIN-only case, however, snd_nxt will be + * set to snd_una, the sb_offset will be 0, and the length may wind + * up 0. + * + * If sack_rxmit is true we are retransmitting from the scoreboard + * in which case len is already set. + */ + if (sack_rxmit == 0) { + uint32_t avail; + + avail = sbavail(sb); + if (SEQ_GT(tp->snd_nxt, tp->snd_una)) + sb_offset = tp->snd_nxt - tp->snd_una; + else + sb_offset = 0; + if (IN_RECOVERY(tp->t_flags) == 0) { + if (rack->r_ctl.rc_tlp_new_data) { + /* TLP is forcing out new data */ + if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { + rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); + } + if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) + len = tp->snd_wnd; + else + len = rack->r_ctl.rc_tlp_new_data; + rack->r_ctl.rc_tlp_new_data = 0; + doing_tlp = 1; + } else { + if (sendwin > avail) { + /* use the available */ + if (avail > sb_offset) { + len = (int32_t)(avail - sb_offset); + } else { + len = 0; + } + } else { + if (sendwin > sb_offset) { + len = (int32_t)(sendwin - sb_offset); + } else { + len = 0; + } + } + } + } else { + uint32_t outstanding; + + /* + * We are inside of a SACK recovery episode and are + * sending new data, having retransmitted all the + * data possible so far in the scoreboard. 
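 */

/*
 * Editor's illustrative sketch, not part of this revision: the PRR
 * bookkeeping below limits new data during recovery to what the
 * proportional-rate-reduction budget (rc_prr_sndcnt) and the peer's
 * window still allow; the later clamp to a single t_maxseg and the
 * "only if it empties the socket buffer" rule are omitted here.
 * Helper name and parameters are hypothetical.
 */
#if 0
static uint32_t
sketch_prr_new_data_len(uint32_t snd_una, uint32_t snd_max,
    uint32_t snd_wnd, uint32_t avail, uint32_t sb_offset,
    uint32_t prr_sndcnt)
{
	uint32_t outstanding = snd_max - snd_una;
	uint32_t len;

	/* The PRR budget plus data in flight must still fit the window. */
	if (prr_sndcnt + outstanding > snd_wnd)
		return (0);
	len = (avail > sb_offset) ? avail - sb_offset : 0;
	return (len > prr_sndcnt ? prr_sndcnt : len);
}
#endif
/*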
+ */ + outstanding = tp->snd_max - tp->snd_una; + if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) + len = 0; + else if (avail > sb_offset) + len = avail - sb_offset; + else + len = 0; + if (len > 0) { + if (len > rack->r_ctl.rc_prr_sndcnt) + len = rack->r_ctl.rc_prr_sndcnt; + + if (len > 0) { + sub_from_prr = 1; + counter_u64_add(rack_rtm_prr_newdata, 1); + } + } + if (len > tp->t_maxseg) { + /* + * We should never send more than a MSS when + * retransmitting or sending new data in prr + * mode unless the override flag is on. Most + * likely the PRR algorithm is not going to + * let us send a lot as well :-) + */ + if (rack->r_ctl.rc_prr_sendalot == 0) + len = tp->t_maxseg; + } else if (len < tp->t_maxseg) { + /* + * Do we send any? The idea here is if the + * send empty's the socket buffer we want to + * do it. However if not then lets just wait + * for our prr_sndcnt to get bigger. + */ + long leftinsb; + + leftinsb = sbavail(sb) - sb_offset; + if (leftinsb > len) { + /* This send does not empty the sb */ + len = 0; + } + } + } + } + if (prefetch_so_done == 0) { + kern_prefetch(so, &prefetch_so_done); + prefetch_so_done = 1; + } + /* + * Lop off SYN bit if it has already been sent. However, if this is + * SYN-SENT state and if segment contains data and if we don't know + * that foreign host supports TAO, suppress sending segment. + */ + if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { + if ((tp->t_state != TCPS_SYN_RECEIVED) && + (tp->t_state != TCPS_SYN_SENT)) + flags &= ~TH_SYN; +#ifdef TCP_RFC7413 + /* + * When sending additional segments following a TFO SYN|ACK, + * do not include the SYN bit. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED)) + flags &= ~TH_SYN; +#endif + sb_offset--, len++; + if (sbavail(sb) == 0) + len = 0; + } + /* + * Be careful not to send data and/or FIN on SYN segments. This + * measure is needed to prevent interoperability problems with not + * fully conformant TCP implementations. + */ + if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { + len = 0; + flags &= ~TH_FIN; + } +#ifdef TCP_RFC7413 + /* + * When retransmitting SYN|ACK on a passively-created TFO socket, + * don't include data, as the presence of data may have caused the + * original SYN|ACK to have been dropped by a middlebox. + */ + if ((tp->t_flags & TF_FASTOPEN) && + ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0))) + len = 0; +#endif + if (len <= 0) { + /* + * If FIN has been sent but not acked, but we haven't been + * called to retransmit, len will be < 0. Otherwise, window + * shrank after we sent into it. If window shrank to 0, + * cancel pending retransmit, pull snd_nxt back to (closed) + * window, and set the persist timer if it isn't already + * going. If the window didn't close completely, just wait + * for an ACK. + * + * We also do a general check here to ensure that we will + * set the persist timer when we have data to send, but a + * 0-byte window. This makes sure the persist timer is set + * even if the packet hits one of the "goto send" lines + * below. + */ + len = 0; + if ((tp->snd_wnd == 0) && + (TCPS_HAVEESTABLISHED(tp->t_state)) && + (sb_offset < (int)sbavail(sb))) { + tp->snd_nxt = tp->snd_una; + rack_enter_persist(tp, rack, cts); + } + } + /* len will be >= 0 after this point. */ + KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); + tcp_sndbuf_autoscale(tp, so, sendwin); + /* + * Decide if we can use TCP Segmentation Offloading (if supported by + * hardware). 
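The recovery branch above additionally gates new data on the PRR budget, caps a round at one MSS unless the sendalot override is set, and refuses a sub-MSS send that would not drain the socket buffer. Sketched standalone below, with stand-in names rather than the r_ctl fields:

#include <stdint.h>

/*
 * Sketch: PRR-limited new data while in SACK recovery.
 * Returns bytes to send now; 0 means hold off.
 */
static uint32_t
prr_new_data_len(uint32_t avail, uint32_t sb_offset, uint32_t outstanding,
    uint32_t snd_wnd, uint32_t prr_budget, uint32_t maxseg, int sendalot_ok)
{
    uint32_t len, leftinsb;

    if (prr_budget + outstanding > snd_wnd)
        return (0);                 /* window is already full */
    len = (avail > sb_offset) ? avail - sb_offset : 0;
    if (len == 0)
        return (0);
    if (len > prr_budget)
        len = prr_budget;
    if (len > maxseg) {
        if (!sendalot_ok)
            len = maxseg;           /* at most one MSS per PRR round */
    } else {
        leftinsb = avail - sb_offset;
        if (leftinsb > len)
            len = 0;                /* runt that won't empty the buffer */
    }
    return (len);
}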
+ * + * TSO may only be used if we are in a pure bulk sending state. The + * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP + * options prevent using TSO. With TSO the TCP header is the same + * (except for the sequence number) for all generated packets. This + * makes it impossible to transmit any options which vary per + * generated segment or packet. + * + * IPv4 handling has a clear separation of ip options and ip header + * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does + * the right thing below to provide length of just ip options and thus + * checking for ipoptlen is enough to decide if ip options are present. + */ + +#ifdef INET6 + if (isipv6) + ipoptlen = ip6_optlen(tp->t_inpcb); + else +#endif + if (tp->t_inpcb->inp_options) + ipoptlen = tp->t_inpcb->inp_options->m_len - + offsetof(struct ipoption, ipopt_list); + else + ipoptlen = 0; +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + /* + * Pre-calculate here as we save another lookup into the darknesses + * of IPsec that way and can actually decide if TSO is ok. + */ +#ifdef INET6 + if (isipv6 && IPSEC_ENABLED(ipv6)) + ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); +#ifdef INET + else +#endif +#endif /* INET6 */ +#ifdef INET + if (IPSEC_ENABLED(ipv4)) + ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); +#endif /* INET */ +#endif + +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + ipoptlen += ipsec_optlen; +#endif + if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && + (tp->t_port == 0) && + ((tp->t_flags & TF_SIGNATURE) == 0) && + tp->rcv_numsacks == 0 && sack_rxmit == 0 && + ipoptlen == 0) + tso = 1; + { + uint32_t outstanding; + + outstanding = tp->snd_max - tp->snd_una; + if (tp->t_flags & TF_SENTFIN) { + /* + * If we sent a fin, snd_max is 1 higher than + * snd_una + */ + outstanding--; + } + if (outstanding > 0) { + /* + * This is sub-optimal. We only send a stand alone + * FIN on its own segment. + */ + if (flags & TH_FIN) { + flags &= ~TH_FIN; + would_have_fin = 1; + } + } else if (sack_rxmit) { + if ((rsm->r_flags & RACK_HAS_FIN) == 0) + flags &= ~TH_FIN; + } else { + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + + sbused(sb))) + flags &= ~TH_FIN; + } + } + recwin = sbspace(&so->so_rcv); + + /* + * Sender silly window avoidance. We transmit under the following + * conditions when len is non-zero: + * + * - We have a full segment (or more with TSO) - This is the last + * buffer in a write()/send() and we are either idle or running + * NODELAY - we've timed out (e.g. persist timer) - we have more + * then 1/2 the maximum send window's worth of data (receiver may be + * limited the window size) - we need to retransmit + */ + if (len) { + if (len >= tp->t_maxseg) { + pass = 1; + goto send; + } + /* + * NOTE! on localhost connections an 'ack' from the remote + * end may occur synchronously with the output and cause us + * to flush a buffer queued with moretocome. XXX + * + */ + if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ + (idle || (tp->t_flags & TF_NODELAY)) && + ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && + (tp->t_flags & TF_NOPUSH) == 0) { + pass = 2; + goto send; + } + if (tp->t_flags & TF_FORCEDATA) { /* typ. 
timeout case */ + pass = 3; + goto send; + } + if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ + goto send; + } + if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { + pass = 4; + goto send; + } + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ + pass = 5; + goto send; + } + if (sack_rxmit) { + pass = 6; + goto send; + } + } + /* + * Sending of standalone window updates. + * + * Window updates are important when we close our window due to a + * full socket buffer and are opening it again after the application + * reads data from it. Once the window has opened again and the + * remote end starts to send again the ACK clock takes over and + * provides the most current window information. + * + * We must avoid the silly window syndrome whereas every read from + * the receive buffer, no matter how small, causes a window update + * to be sent. We also should avoid sending a flurry of window + * updates when the socket buffer had queued a lot of data and the + * application is doing small reads. + * + * Prevent a flurry of pointless window updates by only sending an + * update when we can increase the advertized window by more than + * 1/4th of the socket buffer capacity. When the buffer is getting + * full or is very small be more aggressive and send an update + * whenever we can increase by two mss sized segments. In all other + * situations the ACK's to new incoming data will carry further + * window increases. + * + * Don't send an independent window update if a delayed ACK is + * pending (it will get piggy-backed on it) or the remote side + * already has done a half-close and won't send more data. Skip + * this if the connection is in T/TCP half-open state. + */ + if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && + !(tp->t_flags & TF_DELACK) && + !TCPS_HAVERCVDFIN(tp->t_state)) { + /* + * "adv" is the amount we could increase the window, taking + * into account that we are limited by TCP_MAXWIN << + * tp->rcv_scale. + */ + int32_t adv; + int oldwin; + + adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); + if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { + oldwin = (tp->rcv_adv - tp->rcv_nxt); + adv -= oldwin; + } else + oldwin = 0; + + /* + * If the new window size ends up being the same as the old + * size when it is scaled, then don't force a window update. + */ + if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) + goto dontupdate; + + if (adv >= (int32_t)(2 * tp->t_maxseg) && + (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || + recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || + so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { + pass = 7; + goto send; + } + if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) + goto send; + } +dontupdate: + + /* + * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW + * is also a catch-all for the retransmit timer timeout case. + */ + if (tp->t_flags & TF_ACKNOW) { + pass = 8; + goto send; + } + if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { + pass = 9; + goto send; + } + if (SEQ_GT(tp->snd_up, tp->snd_una)) { + pass = 10; + goto send; + } + /* + * If our state indicates that FIN should be sent and we have not + * yet done so, then we need to send. + */ + if (flags & TH_FIN) { + if ((tp->t_flags & TF_SENTFIN) || + (((tp->t_flags & TF_SENTFIN) == 0) && + (tp->snd_nxt == tp->snd_una))) { + pass = 11; + goto send; + } + } + /* + * No reason to send a segment, just return. 
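The standalone window-update rules above only fire when the advertisement can grow by at least two segments and by a meaningful share of the receive buffer (or when the buffer is small or nearly full). A hedged sketch of that test, with the receiver-side values passed in explicitly so it can stand alone:

#include <stdint.h>

/*
 * Sketch: should we transmit a pure window update?
 * adv    - additional window we could advertise beyond what the peer
 *          already holds (recwin minus the outstanding advertisement)
 * recwin - current space in the receive buffer
 * hiwat  - receive buffer capacity (so_rcv.sb_hiwat)
 * maxseg - one MSS
 */
static int
want_window_update(int32_t adv, int32_t recwin, uint32_t hiwat,
    uint32_t maxseg)
{
    if (adv >= (int32_t)(2 * maxseg) &&
        (adv >= (int32_t)(hiwat / 4) ||
         recwin <= (int32_t)(hiwat / 8) ||
         hiwat <= 8 * maxseg))
        return (1);
    if (2 * adv >= (int32_t)hiwat)
        return (1);
    return (0);
}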
+ */ +just_return: + SOCKBUF_UNLOCK(sb); +just_return_nolock: + if (tot_len_this_send == 0) + counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); + rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); + rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); + tp->t_flags &= ~TF_FORCEDATA; + return (0); + +send: + if (doing_tlp == 0) { + /* + * Data not a TLP, and its not the rxt firing. If it is the + * rxt firing, we want to leave the tlp_in_progress flag on + * so we don't send another TLP. It has to be a rack timer + * or normal send (response to acked data) to clear the tlp + * in progress flag. + */ + rack->rc_tlp_in_progress = 0; + } + SOCKBUF_LOCK_ASSERT(sb); + if (len > 0) { + if (len >= tp->t_maxseg) + tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; + else + tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; + } + /* + * Before ESTABLISHED, force sending of initial options unless TCP + * set not to do any options. NOTE: we assume that the IP/TCP header + * plus TCP options always fit in a single mbuf, leaving room for a + * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) + * + optlen <= MCLBYTES + */ + optlen = 0; +#ifdef INET6 + if (isipv6) + hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + else +#endif + hdrlen = sizeof(struct tcpiphdr); + + /* + * Compute options for segment. We only have to care about SYN and + * established connection segments. Options for SYN-ACK segments + * are handled in TCP syncache. + */ + to.to_flags = 0; + if ((tp->t_flags & TF_NOOPT) == 0) { + /* Maximum segment size. */ + if (flags & TH_SYN) { + tp->snd_nxt = tp->iss; + to.to_mss = tcp_mssopt(&inp->inp_inc); +#ifdef NETFLIX_TCPOUDP + if (tp->t_port) + to.to_mss -= V_tcp_udp_tunneling_overhead; +#endif + to.to_flags |= TOF_MSS; +#ifdef TCP_RFC7413 + /* + * Only include the TFO option on the first + * transmission of the SYN|ACK on a + * passively-created TFO socket, as the presence of + * the TFO option may have caused the original + * SYN|ACK to have been dropped by a middlebox. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED) && + (tp->t_rxtshift == 0)) { + to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN; + to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; + to.to_flags |= TOF_FASTOPEN; + } +#endif + } + /* Window scaling. */ + if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { + to.to_wscale = tp->request_r_scale; + to.to_flags |= TOF_SCALE; + } + /* Timestamps. */ + if ((tp->t_flags & TF_RCVD_TSTMP) || + ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { + to.to_tsval = cts + tp->ts_offset; + to.to_tsecr = tp->ts_recent; + to.to_flags |= TOF_TS; + } + /* Set receive buffer autosizing timestamp. */ + if (tp->rfbuf_ts == 0 && + (so->so_rcv.sb_flags & SB_AUTOSIZE)) + tp->rfbuf_ts = tcp_ts_getticks(); + /* Selective ACK's. */ + if (flags & TH_SYN) + to.to_flags |= TOF_SACKPERM; + else if (TCPS_HAVEESTABLISHED(tp->t_state) && + tp->rcv_numsacks > 0) { + to.to_flags |= TOF_SACK; + to.to_nsacks = tp->rcv_numsacks; + to.to_sacks = (u_char *)tp->sackblks; + } +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + /* TCP-MD5 (RFC2385). */ + if (tp->t_flags & TF_SIGNATURE) + to.to_flags |= TOF_SIGNATURE; +#endif /* TCP_SIGNATURE */ + + /* Processing the options. */ + hdrlen += optlen = tcp_addoptions(&to, opt); + } +#ifdef NETFLIX_TCPOUDP + if (tp->t_port) { + if (V_tcp_udp_tunneling_port == 0) { + /* The port was removed?? 
*/ + SOCKBUF_UNLOCK(&so->so_snd); + return (EHOSTUNREACH); + } + hdrlen += sizeof(struct udphdr); + } +#endif + ipoptlen = 0; +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + ipoptlen += ipsec_optlen; +#endif + + /* + * Adjust data length if insertion of options will bump the packet + * length beyond the t_maxseg length. Clear the FIN bit because we + * cut off the tail of the segment. + */ + if (len + optlen + ipoptlen > tp->t_maxseg) { + if (flags & TH_FIN) { + would_have_fin = 1; + flags &= ~TH_FIN; + } + if (tso) { + uint32_t if_hw_tsomax; + uint32_t moff; + int32_t max_len; + + /* extract TSO information */ + if_hw_tsomax = tp->t_tsomax; + if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; + if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; + KASSERT(ipoptlen == 0, + ("%s: TSO can't do IP options", __func__)); + + /* + * Check if we should limit by maximum payload + * length: + */ + if (if_hw_tsomax != 0) { + /* compute maximum TSO length */ + max_len = (if_hw_tsomax - hdrlen - + max_linkhdr); + if (max_len <= 0) { + len = 0; + } else if (len > max_len) { + sendalot = 1; + len = max_len; + } + } + /* + * Prevent the last segment from being fractional + * unless the send sockbuf can be emptied: + */ + max_len = (tp->t_maxseg - optlen); + if ((sb_offset + len) < sbavail(sb)) { + moff = len % (u_int)max_len; + if (moff != 0) { + len -= moff; + sendalot = 1; + } + } + /* + * In case there are too many small fragments don't + * use TSO: + */ + if (len <= max_len) { + len = max_len; + sendalot = 1; + tso = 0; + } + /* + * Send the FIN in a separate segment after the bulk + * sending is done. We don't trust the TSO + * implementations to clear the FIN flag on all but + * the last segment. + */ + if (tp->t_flags & TF_NEEDFIN) + sendalot = 1; + + } else { + len = tp->t_maxseg - optlen - ipoptlen; + sendalot = 1; + } + } else + tso = 0; + KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, + ("%s: len > IP_MAXPACKET", __func__)); +#ifdef DIAGNOSTIC +#ifdef INET6 + if (max_linkhdr + hdrlen > MCLBYTES) +#else + if (max_linkhdr + hdrlen > MHLEN) +#endif + panic("tcphdr too big"); +#endif + + /* + * This KASSERT is here to catch edge cases at a well defined place. + * Before, those had triggered (random) panic conditions further + * down. + */ + KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); + if ((len == 0) && + (flags & TH_FIN) && + (sbused(sb))) { + /* + * We have outstanding data, don't send a fin by itself!. + */ + goto just_return; + } + /* + * Grab a header mbuf, attaching a copy of data to be transmitted, + * and initialize the header from the template for sends on this + * connection. + */ + if (len) { + uint32_t max_val; + uint32_t moff; + + if (rack->rc_pace_max_segs) + max_val = rack->rc_pace_max_segs * tp->t_maxseg; + else + max_val = len; + /* + * We allow a limit on sending with hptsi. + */ + if (len > max_val) { + len = max_val; + } +#ifdef INET6 + if (MHLEN < hdrlen + max_linkhdr) + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + else +#endif + m = m_gethdr(M_NOWAIT, MT_DATA); + + if (m == NULL) { + SOCKBUF_UNLOCK(sb); + error = ENOBUFS; + sack_rxmit = 0; + goto out; + } + m->m_data += max_linkhdr; + m->m_len = hdrlen; + + /* + * Start the m_copy functions from the closest mbuf to the + * sb_offset in the socket buffer chain. 
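When TSO survives the option checks, the block above caps the burst at the hardware limit and trims any fractional trailing segment unless the send would empty the socket buffer. A simplified sketch of that trimming; the argument names are assumptions, since the kernel works on the tcpcb and mbuf chain instead:

#include <stdint.h>

/*
 * Sketch: trim a TSO burst.
 * len          - candidate payload length (in/out)
 * if_hw_tsomax - NIC limit on a whole TSO frame, 0 if unlimited
 * hdrlen       - TCP/IP header bytes; max_linkhdr - link header reserve
 * seg_payload  - payload per segment (maxseg - optlen), assumed > 0
 * empties_sb   - nonzero if sending len bytes empties the send buffer
 * Returns 1 if another pass will be needed (sendalot), else 0.
 */
static int
tso_trim(uint32_t *len, uint32_t if_hw_tsomax, uint32_t hdrlen,
    uint32_t max_linkhdr, uint32_t seg_payload, int empties_sb)
{
    int sendalot = 0;
    uint32_t moff;

    if (if_hw_tsomax != 0) {
        int32_t max_len = (int32_t)if_hw_tsomax -
            (int32_t)(hdrlen + max_linkhdr);
        if (max_len <= 0) {
            *len = 0;
            return (0);
        }
        if (*len > (uint32_t)max_len) {
            *len = (uint32_t)max_len;
            sendalot = 1;
        }
    }
    if (!empties_sb) {
        moff = *len % seg_payload;      /* fractional tail */
        if (moff != 0) {
            *len -= moff;
            sendalot = 1;
        }
    }
    return (sendalot);
}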
+ */ + mb = sbsndptr_noadv(sb, sb_offset, &moff); + if (len <= MHLEN - hdrlen - max_linkhdr) { + m_copydata(mb, moff, (int)len, + mtod(m, caddr_t)+hdrlen); + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + sbsndptr_adv(sb, mb, len); + m->m_len += len; + } else { + struct sockbuf *msb; + + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + msb = NULL; + else + msb = sb; + m->m_next = tcp_m_copym(mb, moff, &len, + if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); + if (len <= (tp->t_maxseg - optlen)) { + /* + * Must have ran out of mbufs for the copy + * shorten it to no longer need tso. Lets + * not put on sendalot since we are low on + * mbufs. + */ + tso = 0; + } + if (m->m_next == NULL) { + SOCKBUF_UNLOCK(sb); + (void)m_free(m); + error = ENOBUFS; + sack_rxmit = 0; + goto out; + } + } + if ((tp->t_flags & TF_FORCEDATA) && len == 1) { + TCPSTAT_INC(tcps_sndprobe); +#ifdef NETFLIX_STATS + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + stats_voi_update_abs_u32(tp->t_stats, + VOI_TCP_RETXPB, len); + else + stats_voi_update_abs_u64(tp->t_stats, + VOI_TCP_TXPB, len); +#endif + } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { + if (rsm && (rsm->r_flags & RACK_TLP)) { + /* + * TLP should not count in retran count, but + * in its own bin + */ + counter_u64_add(rack_tlp_retran, 1); + counter_u64_add(rack_tlp_retran_bytes, len); + } else { + tp->t_sndrexmitpack++; + TCPSTAT_INC(tcps_sndrexmitpack); + TCPSTAT_ADD(tcps_sndrexmitbyte, len); + } +#ifdef NETFLIX_STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, + len); +#endif + } else { + TCPSTAT_INC(tcps_sndpack); + TCPSTAT_ADD(tcps_sndbyte, len); +#ifdef NETFLIX_STATS + stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, + len); +#endif + } + /* + * If we're sending everything we've got, set PUSH. (This + * will keep happy those implementations which only give + * data to the user when a buffer fills or a PUSH comes in.) + */ + if (sb_offset + len == sbused(sb) && + sbused(sb) && + !(flags & TH_SYN)) + flags |= TH_PUSH; + + /* + * Are we doing hptsi, if so we must calculate the slot. We + * only do hptsi in ESTABLISHED and with no RESET being + * sent where we have data to send. + */ + if (((tp->t_state == TCPS_ESTABLISHED) || + (tp->t_state == TCPS_CLOSE_WAIT) || + ((tp->t_state == TCPS_FIN_WAIT_1) && + ((tp->t_flags & TF_SENTFIN) == 0) && + ((flags & TH_FIN) == 0))) && + ((flags & TH_RST) == 0) && + (rack->rc_always_pace)) { + /* + * We use the most optimistic possible cwnd/srtt for + * sending calculations. This will make our + * calculation anticipate getting more through + * quicker then possible. But thats ok we don't want + * the peer to have a gap in data sending. + */ + uint32_t srtt, cwnd, tr_perms = 0; + + if (rack->r_ctl.rc_rack_min_rtt) + srtt = rack->r_ctl.rc_rack_min_rtt; + else + srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); + if (rack->r_ctl.rc_rack_largest_cwnd) + cwnd = rack->r_ctl.rc_rack_largest_cwnd; + else + cwnd = tp->snd_cwnd; + tr_perms = cwnd / srtt; + if (tr_perms == 0) { + tr_perms = tp->t_maxseg; + } + tot_len_this_send += len; + /* + * Calculate how long this will take to drain, if + * the calculation comes out to zero, thats ok we + * will use send_a_lot to possibly spin around for + * more increasing tot_len_this_send to the point + * that its going to require a pace, or we hit the + * cwnd. Which in that case we are just waiting for + * a ACK. + */ + slot = tot_len_this_send / tr_perms; + /* Now do we reduce the time so we don't run dry? 
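The pacing computation above, together with the reduction applied just below, amounts to a bytes-per-millisecond budget derived from cwnd and RTT and a slot long enough to drain what this call queued. A standalone sketch of the arithmetic, with plain integers in place of the rack state:

#include <stdint.h>

/*
 * Sketch: how long (in ms) to wait before the next transmission.
 * cwnd        - congestion window in bytes (or the largest cwnd seen)
 * srtt_ms     - smoothed or minimum RTT in milliseconds
 * tot_len     - bytes queued by this pass through the output path
 * pace_reduce - divisor; the fraction of the slot to give back (0 = none)
 */
static uint32_t
pace_slot_ms(uint32_t cwnd, uint32_t srtt_ms, uint32_t tot_len,
    uint32_t pace_reduce, uint32_t maxseg)
{
    uint32_t tr_perms, slot, reduce;

    if (srtt_ms == 0)
        srtt_ms = 1;
    tr_perms = cwnd / srtt_ms;          /* bytes we may emit per ms */
    if (tr_perms == 0)
        tr_perms = maxseg;
    slot = tot_len / tr_perms;          /* ms needed to drain this send */
    if (slot && pace_reduce) {
        reduce = slot / pace_reduce;
        slot = (reduce < slot) ? slot - reduce : 0;
    }
    return (slot);
}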
*/ + if (slot && rack->rc_pace_reduce) { + int32_t reduce; + + reduce = (slot / rack->rc_pace_reduce); + if (reduce < slot) { + slot -= reduce; + } else + slot = 0; + } + if (rack->r_enforce_min_pace && + (slot == 0) && + (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { + /* We are enforcing a minimum pace time of 1ms */ + slot = rack->r_enforce_min_pace; + } + } + SOCKBUF_UNLOCK(sb); + } else { + SOCKBUF_UNLOCK(sb); + if (tp->t_flags & TF_ACKNOW) + TCPSTAT_INC(tcps_sndacks); + else if (flags & (TH_SYN | TH_FIN | TH_RST)) + TCPSTAT_INC(tcps_sndctrl); + else if (SEQ_GT(tp->snd_up, tp->snd_una)) + TCPSTAT_INC(tcps_sndurg); + else + TCPSTAT_INC(tcps_sndwinup); + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + sack_rxmit = 0; + goto out; + } +#ifdef INET6 + if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && + MHLEN >= hdrlen) { + M_ALIGN(m, hdrlen); + } else +#endif + m->m_data += max_linkhdr; + m->m_len = hdrlen; + } + SOCKBUF_UNLOCK_ASSERT(sb); + m->m_pkthdr.rcvif = (struct ifnet *)0; +#ifdef MAC + mac_inpcb_create_mbuf(inp, m); +#endif +#ifdef INET6 + if (isipv6) { + ip6 = mtod(m, struct ip6_hdr *); +#ifdef NETFLIX_TCPOUDP + if (tp->t_port) { + udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tp->t_port; + ulen = hdrlen + len - sizeof(struct ip6_hdr); + udp->uh_ulen = htons(ulen); + th = (struct tcphdr *)(udp + 1); + } else +#endif + th = (struct tcphdr *)(ip6 + 1); + tcpip_fillheaders(inp, ip6, th); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); +#ifdef TCPDEBUG + ipov = (struct ipovly *)ip; +#endif +#ifdef NETFLIX_TCPOUDP + if (tp->t_port) { + udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tp->t_port; + ulen = hdrlen + len - sizeof(struct ip); + udp->uh_ulen = htons(ulen); + th = (struct tcphdr *)(udp + 1); + } else +#endif + th = (struct tcphdr *)(ip + 1); + tcpip_fillheaders(inp, ip, th); + } + /* + * Fill in fields, remembering maximum advertised window for use in + * delaying messages about window sizes. If resending a FIN, be sure + * not to use a new sequence number. + */ + if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && + tp->snd_nxt == tp->snd_max) + tp->snd_nxt--; + /* + * If we are starting a connection, send ECN setup SYN packet. If we + * are on a retransmit, we may resend those bits a number of times + * as per RFC 3168. + */ + if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { + if (tp->t_rxtshift >= 1) { + if (tp->t_rxtshift <= V_tcp_ecn_maxretries) + flags |= TH_ECE | TH_CWR; + } else + flags |= TH_ECE | TH_CWR; + } + if (tp->t_state == TCPS_ESTABLISHED && + (tp->t_flags & TF_ECN_PERMIT)) { + /* + * If the peer has ECN, mark data packets with ECN capable + * transmission (ECT). Ignore pure ack packets, + * retransmissions and window probes. + */ + if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && + !((tp->t_flags & TF_FORCEDATA) && len == 1)) { +#ifdef INET6 + if (isipv6) + ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); + else +#endif + ip->ip_tos |= IPTOS_ECN_ECT0; + TCPSTAT_INC(tcps_ecn_ect0); + } + /* + * Reply with proper ECN notifications. + */ + if (tp->t_flags & TF_ECN_SND_CWR) { + flags |= TH_CWR; + tp->t_flags &= ~TF_ECN_SND_CWR; + } + if (tp->t_flags & TF_ECN_SND_ECE) + flags |= TH_ECE; + } + /* + * If we are doing retransmissions, then snd_nxt will not reflect + * the first unsent octet. 
For ACK only packets, we do not want the + * sequence number of the retransmitted packet, we want the sequence + * number of the next unsent octet. So, if there is no data (and no + * SYN or FIN), use snd_max instead of snd_nxt when filling in + * ti_seq. But if we are in persist state, snd_max might reflect + * one byte beyond the right edge of the window, so use snd_nxt in + * that case, since we know we aren't doing a retransmission. + * (retransmit and persist are mutually exclusive...) + */ + if (sack_rxmit == 0) { + if (len || (flags & (TH_SYN | TH_FIN)) || + rack->rc_in_persist) { + th->th_seq = htonl(tp->snd_nxt); + rack_seq = tp->snd_nxt; + } else if (flags & TH_RST) { + /* + * For a Reset send the last cum ack in sequence + * (this like any other choice may still generate a + * challenge ack, if a ack-update packet is in + * flight). + */ + th->th_seq = htonl(tp->snd_una); + rack_seq = tp->snd_una; + } else { + th->th_seq = htonl(tp->snd_max); + rack_seq = tp->snd_max; + } + } else { + th->th_seq = htonl(rsm->r_start); + rack_seq = rsm->r_start; + } + th->th_ack = htonl(tp->rcv_nxt); + if (optlen) { + bcopy(opt, th + 1, optlen); + th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; + } + th->th_flags = flags; + /* + * Calculate receive window. Don't shrink window, but avoid silly + * window syndrome. + */ + if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && + recwin < (long)tp->t_maxseg) + recwin = 0; + if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && + recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) + recwin = (long)(tp->rcv_adv - tp->rcv_nxt); + if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) + recwin = (long)TCP_MAXWIN << tp->rcv_scale; + + /* + * According to RFC1323 the window field in a SYN (i.e., a or + * ) segment itself is never scaled. The case is + * handled in syncache. + */ + if (flags & TH_SYN) + th->th_win = htons((u_short) + (min(sbspace(&so->so_rcv), TCP_MAXWIN))); + else + th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); + /* + * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 + * window. This may cause the remote transmitter to stall. This + * flag tells soreceive() to disable delayed acknowledgements when + * draining the buffer. This can occur if the receiver is + * attempting to read more data than can be buffered prior to + * transmitting on the connection. + */ + if (th->th_win == 0) { + tp->t_sndzerowin++; + tp->t_flags |= TF_RXWIN0SENT; + } else + tp->t_flags &= ~TF_RXWIN0SENT; + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { + th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); + th->th_flags |= TH_URG; + } else + /* + * If no urgent pointer to send, then we pull the urgent + * pointer to the left edge of the send window so that it + * doesn't drift into the send window on sequence number + * wraparound. + */ + tp->snd_up = tp->snd_una; /* drag it along */ + +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (to.to_flags & TOF_SIGNATURE) { + /* + * Calculate MD5 signature and put it into the place + * determined before. + * NOTE: since TCP options buffer doesn't point into + * mbuf's data, calculate offset and use it. + */ + if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, + (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { + /* + * Do not send segment if the calculation of MD5 + * digest has failed. + */ + goto out; + } + } +#endif + + /* + * Put TCP length in extended header, and then checksum extended + * header and data. 
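Before th_win is filled in above, the advertised window passes through three clamps: silly-window suppression, never shrinking an earlier advertisement, and the scaled 16-bit ceiling. Sketched below in standalone form; TCP_MAXWIN_SK is a local stand-in for the kernel's TCP_MAXWIN:

#include <stdint.h>

#define TCP_MAXWIN_SK 65535U            /* unscaled 16-bit window limit */

/*
 * Sketch: clamp the receive window we are about to advertise.
 * recwin    - sbspace() of the receive buffer
 * already   - rcv_adv - rcv_nxt, window the peer already holds (0 if none)
 * rcv_scale - our window-scale shift
 * hiwat     - receive buffer capacity
 * maxseg    - one MSS
 */
static uint32_t
clamp_recwin(uint32_t recwin, uint32_t already, int rcv_scale,
    uint32_t hiwat, uint32_t maxseg)
{
    if (recwin < hiwat / 4 && recwin < maxseg)
        recwin = 0;                     /* avoid silly window offers */
    if (recwin < already)
        recwin = already;               /* never shrink the window */
    if (recwin > (TCP_MAXWIN_SK << rcv_scale))
        recwin = TCP_MAXWIN_SK << rcv_scale;
    return (recwin);
}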
+ */ + m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ +#ifdef INET6 + if (isipv6) { + /* + * ip6_plen is not need to be filled now, and will be filled + * in ip6_output. + */ + if (tp->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); + th->th_sum = htons(0); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in6_cksum_pseudo(ip6, + sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, + 0); + } + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + { + if (tp->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); + th->th_sum = htons(0); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + + IPPROTO_TCP + len + optlen)); + } + /* IP version must be set here for ipv4/ipv6 checking later */ + KASSERT(ip->ip_v == IPVERSION, + ("%s: IP version incorrect: %d", __func__, ip->ip_v)); + } +#endif + + /* + * Enable TSO and specify the size of the segments. The TCP pseudo + * header checksum is always provided. XXX: Fixme: This is currently + * not the case for IPv6. + */ + if (tso) { + KASSERT(len > tp->t_maxseg - optlen, + ("%s: len <= tso_segsz", __func__)); + m->m_pkthdr.csum_flags |= CSUM_TSO; + m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; + } +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), + ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", + __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); +#else + KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), + ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", + __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); +#endif + +#ifdef TCP_HHOOK + /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ + hhook_run_tcp_est_out(tp, th, &to, len, tso); +#endif + +#ifdef TCPDEBUG + /* + * Trace. + */ + if (so->so_options & SO_DEBUG) { + u_short save = 0; + +#ifdef INET6 + if (!isipv6) +#endif + { + save = ipov->ih_len; + ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + + * (th->th_off << 2) */ ); + } + tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); +#ifdef INET6 + if (!isipv6) +#endif + ipov->ih_len = save; + } +#endif /* TCPDEBUG */ + + /* We're getting ready to send; log now. */ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; + if (rsm || sack_rxmit) { + log.u_bbr.flex8 = 1; + } else { + log.u_bbr.flex8 = 0; + } + lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, + len, &log, false, NULL, NULL, 0, NULL); + } else + lgb = NULL; + + /* + * Fill in IP length and desired time to live and send to IP level. + * There should be a better way to handle ttl and tos; we could keep + * them in the template, but need a way to checksum without them. + */ + /* + * m->m_pkthdr.len should have been set before cksum calcuration, + * because in6_cksum() need it. 
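The csum_flags/csum_data setup above seeds th_sum with a pseudo-header sum and leaves the full 16-bit ones'-complement checksum over header and payload to the hardware or the stack. For reference, a generic RFC 1071-style routine is sketched below; it is an illustration only, not the kernel's in_cksum() or in6_cksum_pseudo():

#include <stddef.h>
#include <stdint.h>

/* Sketch: 16-bit ones'-complement Internet checksum over a buffer. */
static uint16_t
cksum16(const void *data, size_t len)
{
    const uint8_t *p = data;
    uint32_t sum = 0;

    while (len > 1) {
        sum += (uint32_t)p[0] << 8 | p[1];
        p += 2;
        len -= 2;
    }
    if (len == 1)
        sum += (uint32_t)p[0] << 8;     /* pad the odd final byte */
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;
}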
+ */ +#ifdef INET6 + if (isipv6) { + /* + * we separately set hoplimit for every segment, since the + * user might want to change the value via setsockopt. Also, + * desired default hop limit might be changed via Neighbor + * Discovery. + */ + ip6->ip6_hlim = in6_selecthlim(inp, NULL); + + /* + * Set the packet size here for the benefit of DTrace + * probes. ip6_output() will set it properly; it's supposed + * to include the option header lengths as well. + */ + ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); + + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + else + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + + if (tp->t_state == TCPS_SYN_SENT) + TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); + + TCP_PROBE5(send, NULL, tp, ip6, tp, th); + /* TODO: IPv6 IP6TOS_ECT bit on */ + error = ip6_output(m, tp->t_inpcb->in6p_outputopts, + &inp->inp_route6, + ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), + NULL, NULL, inp); + + if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) + mtu = inp->inp_route6.ro_rt->rt_mtu; + } +#endif /* INET6 */ +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + { + ip->ip_len = htons(m->m_pkthdr.len); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6PROTO) + ip->ip_ttl = in6_selecthlim(inp, NULL); +#endif /* INET6 */ + /* + * If we do path MTU discovery, then we set DF on every + * packet. This might not be the best thing to do according + * to RFC3390 Section 2. However the tcp hostcache migitates + * the problem so it affects only the first tcp connection + * with a host. + * + * NB: Don't set DF on small MTU/MSS to have a safe + * fallback. + */ + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + if (tp->t_port == 0 || len < V_tcp_minmss) { + ip->ip_off |= htons(IP_DF); + } + } else { + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + } + + if (tp->t_state == TCPS_SYN_SENT) + TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); + + TCP_PROBE5(send, NULL, tp, ip, tp, th); + + error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, + ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, + inp); + if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) + mtu = inp->inp_route.ro_rt->rt_mtu; + } +#endif /* INET */ + +out: + if (lgb) { + lgb->tlb_errno = error; + lgb = NULL; + } + /* + * In transmit state, time the transmission and arrange for the + * retransmit. In persist state, just set snd_max. + */ + if (error == 0) { + if (len == 0) + counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); + else if (len == 1) { + counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); + } else if (len > 1) { + int idx; + + idx = (len / tp->t_maxseg) + 3; + if (idx >= TCP_MSS_ACCT_ATIMER) + counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); + else + counter_u64_add(rack_out_size[idx], 1); + } + } + if (sub_from_prr && (error == 0)) { + rack->r_ctl.rc_prr_sndcnt -= len; + } + sub_from_prr = 0; + rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, + pass, rsm); + if ((tp->t_flags & TF_FORCEDATA) == 0 || + (rack->rc_in_persist == 0)) { +#ifdef NETFLIX_STATS + tcp_seq startseq = tp->snd_nxt; +#endif + + /* + * Advance snd_nxt over sequence space of this segment. 
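The accounting at the out: label above buckets each successful send by how many MSS it carried, with dedicated slots for pure ACKs and one-byte persist probes and a cap on the last bucket. A sketch of that index selection, reusing the TCP_MSS_ACCT_* values defined in rack_bbr_common.h later in this change:

#include <stdint.h>

#define TCP_MSS_ACCT_SNDACK     1
#define TCP_MSS_ACCT_PERSIST    2
#define TCP_MSS_ACCT_ATIMER     60

/* Sketch: pick the rack_out_size[] bucket for a send of 'len' bytes. */
static int
mss_acct_idx(uint32_t len, uint32_t maxseg)
{
    int idx;

    if (len == 0)
        return (TCP_MSS_ACCT_SNDACK);   /* pure ACK */
    if (len == 1)
        return (TCP_MSS_ACCT_PERSIST);  /* window probe */
    idx = (int)(len / maxseg) + 3;
    if (idx >= TCP_MSS_ACCT_ATIMER)
        idx = TCP_MSS_ACCT_ATIMER - 1;  /* clamp oversized sends */
    return (idx);
}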
+ */ + if (error) + /* We don't log or do anything with errors */ + goto timer; + + if (flags & (TH_SYN | TH_FIN)) { + if (flags & TH_SYN) + tp->snd_nxt++; + if (flags & TH_FIN) { + tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } + } + /* In the ENOBUFS case we do *not* update snd_max */ + if (sack_rxmit) + goto timer; + + tp->snd_nxt += len; + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + if (tp->snd_una == tp->snd_max) { + /* + * Update the time we just added data since + * none was outstanding. + */ + rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); + tp->t_acktime = ticks; + } + tp->snd_max = tp->snd_nxt; +#ifdef NETFLIX_STATS + if (!(tp->t_flags & TF_GPUTINPROG) && len) { + tp->t_flags |= TF_GPUTINPROG; + tp->gput_seq = startseq; + tp->gput_ack = startseq + + ulmin(sbavail(sb) - sb_offset, sendwin); + tp->gput_ts = tcp_ts_getticks(); + } +#endif + } + /* + * Set retransmit timer if not currently set, and not doing + * a pure ack or a keep-alive probe. Initial value for + * retransmit timer is smoothed round-trip time + 2 * + * round-trip time variance. Initialize shift counter which + * is used for backoff of retransmit time. + */ +timer: + if ((tp->snd_wnd == 0) && + TCPS_HAVEESTABLISHED(tp->t_state)) { + /* + * If the persists timer was set above (right before + * the goto send), and still needs to be on. Lets + * make sure all is canceled. If the persist timer + * is not running, we want to get it up. + */ + if (rack->rc_in_persist == 0) { + rack_enter_persist(tp, rack, cts); + } + } + } else { + /* + * Persist case, update snd_max but since we are in persist + * mode (no window) we do not update snd_nxt. + */ + int32_t xlen = len; + + if (error) + goto nomore; + + if (flags & TH_SYN) + ++xlen; + if (flags & TH_FIN) { + ++xlen; + tp->t_flags |= TF_SENTFIN; + } + /* In the ENOBUFS case we do *not* update snd_max */ + if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { + if (tp->snd_una == tp->snd_max) { + /* + * Update the time we just added data since + * none was outstanding. + */ + rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); + tp->t_acktime = ticks; + } + tp->snd_max = tp->snd_nxt + len; + } + } +nomore: + if (error) { + SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ + /* + * Failures do not advance the seq counter above. For the + * case of ENOBUFS we will fall out and retry in 1ms with + * the hpts. Everything else will just have to retransmit + * with the timer. + * + * In any case, we do not want to loop around for another + * send without a good reason. + */ + sendalot = 0; + switch (error) { + case EPERM: + tp->t_flags &= ~TF_FORCEDATA; + tp->t_softerror = error; + return (error); + case ENOBUFS: + if (slot == 0) { + /* + * Pace us right away to retry in a some + * time + */ + slot = 1 + rack->rc_enobuf; + if (rack->rc_enobuf < 255) + rack->rc_enobuf++; + if (slot > (rack->rc_rack_rtt / 2)) { + slot = rack->rc_rack_rtt / 2; + } + if (slot < 10) + slot = 10; + } + counter_u64_add(rack_saw_enobuf, 1); + error = 0; + goto enobufs; + case EMSGSIZE: + /* + * For some reason the interface we used initially + * to send segments changed to another or lowered + * its MTU. If TSO was active we either got an + * interface without TSO capabilits or TSO was + * turned off. If we obtained mtu from ip_output() + * then update it and try again. 
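In the ENOBUFS case above the stack does not retry immediately; it charges a growing pacing delay bounded by half the RACK RTT and floored at a small minimum, then lets the hpts pacer drive the retry. A standalone sketch of that backoff, with plain arguments standing in for the rack fields:

#include <stdint.h>

/*
 * Sketch: pacing delay to apply after an ENOBUFS transmit failure.
 * enobuf_cnt - consecutive ENOBUFS failures so far (in/out, saturates)
 * rack_rtt   - current RACK RTT estimate in ms
 */
static uint32_t
enobufs_backoff(uint8_t *enobuf_cnt, uint32_t rack_rtt)
{
    uint32_t slot;

    slot = 1 + *enobuf_cnt;
    if (*enobuf_cnt < 255)
        (*enobuf_cnt)++;                /* grow the next backoff */
    if (slot > rack_rtt / 2)
        slot = rack_rtt / 2;            /* never wait longer than RTT/2 */
    if (slot < 10)
        slot = 10;                      /* but always wait a little */
    return (slot);
}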
+ */ + if (tso) + tp->t_flags &= ~TF_TSO; + if (mtu != 0) { + tcp_mss_update(tp, -1, mtu, NULL, NULL); + goto again; + } + slot = 10; + rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); + tp->t_flags &= ~TF_FORCEDATA; + return (error); + case ENETUNREACH: + counter_u64_add(rack_saw_enetunreach, 1); + case EHOSTDOWN: + case EHOSTUNREACH: + case ENETDOWN: + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_softerror = error; + } + /* FALLTHROUGH */ + default: + slot = 10; + rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); + tp->t_flags &= ~TF_FORCEDATA; + return (error); + } + } else { + rack->rc_enobuf = 0; + } + TCPSTAT_INC(tcps_sndtotal); + + /* + * Data sent (as far as we can tell). If this advertises a larger + * window than any other segment, then remember the size of the + * advertised window. Any pending ACK has now been sent. + */ + if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + recwin; + tp->last_ack_sent = tp->rcv_nxt; + tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); +enobufs: + rack->r_tlp_running = 0; + if ((flags & TH_RST) || (would_have_fin == 1)) { + /* + * We don't send again after a RST. We also do *not* send + * again if we would have had a find, but now have + * outstanding data. + */ + slot = 0; + sendalot = 0; + } + if (slot) { + /* set the rack tcb into the slot N */ + counter_u64_add(rack_paced_segments, 1); + } else if (sendalot) { + if (len) + counter_u64_add(rack_unpaced_segments, 1); + sack_rxmit = 0; + tp->t_flags &= ~TF_FORCEDATA; + goto again; + } else if (len) { + counter_u64_add(rack_unpaced_segments, 1); + } + tp->t_flags &= ~TF_FORCEDATA; + rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); + return (error); +} + +/* + * rack_ctloutput() must drop the inpcb lock before performing copyin on + * socket option arguments. When it re-acquires the lock after the copy, it + * has to revalidate that the connection is still valid for the socket + * option. 
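The comment above states the contract the sockopt handlers that follow must obey: release the inpcb lock around the user-space copyin, then reacquire it and confirm the connection still exists before touching the tcpcb. A minimal sketch of that shape, with a pthread mutex and memcpy standing in for INP_WLOCK() and sooptcopyin(); both stand-ins are assumptions made so the sketch can stand alone:

#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <string.h>

struct conn {
    pthread_mutex_t lock;   /* analogue of the inpcb lock */
    int dropped;            /* analogue of INP_TIMEWAIT | INP_DROPPED */
    int32_t option;
};

/* Called with c->lock held; returns with it released. */
static int
set_option_pattern(struct conn *c, const int32_t *uval)
{
    int32_t optval;

    pthread_mutex_unlock(&c->lock);     /* never copy from the user under the lock */
    memcpy(&optval, uval, sizeof(optval));
    pthread_mutex_lock(&c->lock);
    if (c->dropped) {                   /* connection may have gone away meanwhile */
        pthread_mutex_unlock(&c->lock);
        return (ECONNRESET);
    }
    c->option = optval;                 /* now safe to apply */
    pthread_mutex_unlock(&c->lock);
    return (0);
}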
+ */ +static int +rack_set_sockopt(struct socket *so, struct sockopt *sopt, + struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) +{ + int32_t error = 0, optval; + + switch (sopt->sopt_name) { + case TCP_RACK_PROP_RATE: + case TCP_RACK_PROP: + case TCP_RACK_TLP_REDUCE: + case TCP_RACK_EARLY_RECOV: + case TCP_RACK_PACE_ALWAYS: + case TCP_DELACK: + case TCP_RACK_PACE_REDUCE: + case TCP_RACK_PACE_MAX_SEG: + case TCP_RACK_PRR_SENDALOT: + case TCP_RACK_MIN_TO: + case TCP_RACK_EARLY_SEG: + case TCP_RACK_REORD_THRESH: + case TCP_RACK_REORD_FADE: + case TCP_RACK_TLP_THRESH: + case TCP_RACK_PKT_DELAY: + case TCP_RACK_TLP_USE: + case TCP_RACK_TLP_INC_VAR: + case TCP_RACK_IDLE_REDUCE_HIGH: + case TCP_RACK_MIN_PACE: + case TCP_RACK_MIN_PACE_SEG: + case TCP_BBR_RACK_RTT_USE: + case TCP_DATA_AFTER_CLOSE: + break; + default: + return (tcp_default_ctloutput(so, sopt, inp, tp)); + break; + } + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); + if (error) + return (error); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); + return (ECONNRESET); + } + tp = intotcpcb(inp); + rack = (struct tcp_rack *)tp->t_fb_ptr; + switch (sopt->sopt_name) { + case TCP_RACK_PROP_RATE: + if ((optval <= 0) || (optval >= 100)) { + error = EINVAL; + break; + } + RACK_OPTS_INC(tcp_rack_prop_rate); + rack->r_ctl.rc_prop_rate = optval; + break; + case TCP_RACK_TLP_USE: + if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { + error = EINVAL; + break; + } + RACK_OPTS_INC(tcp_tlp_use); + rack->rack_tlp_threshold_use = optval; + break; + case TCP_RACK_PROP: + /* RACK proportional rate reduction (bool) */ + RACK_OPTS_INC(tcp_rack_prop); + rack->r_ctl.rc_prop_reduce = optval; + break; + case TCP_RACK_TLP_REDUCE: + /* RACK TLP cwnd reduction (bool) */ + RACK_OPTS_INC(tcp_rack_tlp_reduce); + rack->r_ctl.rc_tlp_cwnd_reduce = optval; + break; + case TCP_RACK_EARLY_RECOV: + /* Should recovery happen early (bool) */ + RACK_OPTS_INC(tcp_rack_early_recov); + rack->r_ctl.rc_early_recovery = optval; + break; + case TCP_RACK_PACE_ALWAYS: + /* Use the always pace method (bool) */ + RACK_OPTS_INC(tcp_rack_pace_always); + if (optval > 0) + rack->rc_always_pace = 1; + else + rack->rc_always_pace = 0; + break; + case TCP_RACK_PACE_REDUCE: + /* RACK Hptsi reduction factor (divisor) */ + RACK_OPTS_INC(tcp_rack_pace_reduce); + if (optval) + /* Must be non-zero */ + rack->rc_pace_reduce = optval; + else + error = EINVAL; + break; + case TCP_RACK_PACE_MAX_SEG: + /* Max segments in a pace */ + RACK_OPTS_INC(tcp_rack_max_seg); + rack->rc_pace_max_segs = optval; + break; + case TCP_RACK_PRR_SENDALOT: + /* Allow PRR to send more than one seg */ + RACK_OPTS_INC(tcp_rack_prr_sendalot); + rack->r_ctl.rc_prr_sendalot = optval; + break; + case TCP_RACK_MIN_TO: + /* Minimum time between rack t-o's in ms */ + RACK_OPTS_INC(tcp_rack_min_to); + rack->r_ctl.rc_min_to = optval; + break; + case TCP_RACK_EARLY_SEG: + /* If early recovery max segments */ + RACK_OPTS_INC(tcp_rack_early_seg); + rack->r_ctl.rc_early_recovery_segs = optval; + break; + case TCP_RACK_REORD_THRESH: + /* RACK reorder threshold (shift amount) */ + RACK_OPTS_INC(tcp_rack_reord_thresh); + if ((optval > 0) && (optval < 31)) + rack->r_ctl.rc_reorder_shift = optval; + else + error = EINVAL; + break; + case TCP_RACK_REORD_FADE: + /* Does reordering fade after ms time */ + RACK_OPTS_INC(tcp_rack_reord_fade); + rack->r_ctl.rc_reorder_fade = optval; + break; + case TCP_RACK_TLP_THRESH: + /* RACK TLP theshold i.e. 
srtt+(srtt/N) */ + RACK_OPTS_INC(tcp_rack_tlp_thresh); + if (optval) + rack->r_ctl.rc_tlp_threshold = optval; + else + error = EINVAL; + break; + case TCP_RACK_PKT_DELAY: + /* RACK added ms i.e. rack-rtt + reord + N */ + RACK_OPTS_INC(tcp_rack_pkt_delay); + rack->r_ctl.rc_pkt_delay = optval; + break; + case TCP_RACK_TLP_INC_VAR: + /* Does TLP include rtt variance in t-o */ + RACK_OPTS_INC(tcp_rack_tlp_inc_var); + rack->r_ctl.rc_prr_inc_var = optval; + break; + case TCP_RACK_IDLE_REDUCE_HIGH: + RACK_OPTS_INC(tcp_rack_idle_reduce_high); + if (optval) + rack->r_idle_reduce_largest = 1; + else + rack->r_idle_reduce_largest = 0; + break; + case TCP_DELACK: + if (optval == 0) + tp->t_delayed_ack = 0; + else + tp->t_delayed_ack = 1; + if (tp->t_flags & TF_DELACK) { + tp->t_flags &= ~TF_DELACK; + tp->t_flags |= TF_ACKNOW; + rack_output(tp); + } + break; + case TCP_RACK_MIN_PACE: + RACK_OPTS_INC(tcp_rack_min_pace); + if (optval > 3) + rack->r_enforce_min_pace = 3; + else + rack->r_enforce_min_pace = optval; + break; + case TCP_RACK_MIN_PACE_SEG: + RACK_OPTS_INC(tcp_rack_min_pace_seg); + if (optval >= 16) + rack->r_min_pace_seg_thresh = 15; + else + rack->r_min_pace_seg_thresh = optval; + break; + case TCP_BBR_RACK_RTT_USE: + if ((optval != USE_RTT_HIGH) && + (optval != USE_RTT_LOW) && + (optval != USE_RTT_AVG)) + error = EINVAL; + else + rack->r_ctl.rc_rate_sample_method = optval; + break; + case TCP_DATA_AFTER_CLOSE: + if (optval) + rack->rc_allow_data_af_clo = 1; + else + rack->rc_allow_data_af_clo = 0; + break; + default: + return (tcp_default_ctloutput(so, sopt, inp, tp)); + break; + } +#ifdef NETFLIX_STATS + tcp_log_socket_option(tp, sopt->sopt_name, optval, error); +#endif + INP_WUNLOCK(inp); + return (error); +} + +static int +rack_get_sockopt(struct socket *so, struct sockopt *sopt, + struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) +{ + int32_t error, optval; + + /* + * Because all our options are either boolean or an int, we can just + * pull everything into optval and then unlock and copy. If we ever + * add a option that is not a int, then this will have quite an + * impact to this routine. 
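Because every RACK option is an int or a bool, the getter that follows loads a single optval in one switch, drops the lock, and copies it out once. The same shape in miniature, with a made-up option table and memcpy in place of sooptcopyout():

#include <stdint.h>
#include <string.h>

/* Sketch: every option reduces to one int32_t, so one copy-out suffices. */
enum { OPT_PACE_REDUCE, OPT_MIN_TO, OPT_REORD_FADE };

struct rack_like {
    int32_t pace_reduce;
    int32_t min_to;
    int32_t reorder_fade;
};

static int
get_option_pattern(const struct rack_like *r, int opt, void *ubuf)
{
    int32_t optval;

    switch (opt) {
    case OPT_PACE_REDUCE:
        optval = r->pace_reduce;
        break;
    case OPT_MIN_TO:
        optval = r->min_to;
        break;
    case OPT_REORD_FADE:
        optval = r->reorder_fade;
        break;
    default:
        return (-1);            /* unknown option */
    }
    /* The lock would be dropped here, then one copy-out of the int. */
    memcpy(ubuf, &optval, sizeof(optval));
    return (0);
}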
+ */ + switch (sopt->sopt_name) { + case TCP_RACK_PROP_RATE: + optval = rack->r_ctl.rc_prop_rate; + break; + case TCP_RACK_PROP: + /* RACK proportional rate reduction (bool) */ + optval = rack->r_ctl.rc_prop_reduce; + break; + case TCP_RACK_TLP_REDUCE: + /* RACK TLP cwnd reduction (bool) */ + optval = rack->r_ctl.rc_tlp_cwnd_reduce; + break; + case TCP_RACK_EARLY_RECOV: + /* Should recovery happen early (bool) */ + optval = rack->r_ctl.rc_early_recovery; + break; + case TCP_RACK_PACE_REDUCE: + /* RACK Hptsi reduction factor (divisor) */ + optval = rack->rc_pace_reduce; + break; + case TCP_RACK_PACE_MAX_SEG: + /* Max segments in a pace */ + optval = rack->rc_pace_max_segs; + break; + case TCP_RACK_PACE_ALWAYS: + /* Use the always pace method */ + optval = rack->rc_always_pace; + break; + case TCP_RACK_PRR_SENDALOT: + /* Allow PRR to send more than one seg */ + optval = rack->r_ctl.rc_prr_sendalot; + break; + case TCP_RACK_MIN_TO: + /* Minimum time between rack t-o's in ms */ + optval = rack->r_ctl.rc_min_to; + break; + case TCP_RACK_EARLY_SEG: + /* If early recovery max segments */ + optval = rack->r_ctl.rc_early_recovery_segs; + break; + case TCP_RACK_REORD_THRESH: + /* RACK reorder threshold (shift amount) */ + optval = rack->r_ctl.rc_reorder_shift; + break; + case TCP_RACK_REORD_FADE: + /* Does reordering fade after ms time */ + optval = rack->r_ctl.rc_reorder_fade; + break; + case TCP_RACK_TLP_THRESH: + /* RACK TLP theshold i.e. srtt+(srtt/N) */ + optval = rack->r_ctl.rc_tlp_threshold; + break; + case TCP_RACK_PKT_DELAY: + /* RACK added ms i.e. rack-rtt + reord + N */ + optval = rack->r_ctl.rc_pkt_delay; + break; + case TCP_RACK_TLP_USE: + optval = rack->rack_tlp_threshold_use; + break; + case TCP_RACK_TLP_INC_VAR: + /* Does TLP include rtt variance in t-o */ + optval = rack->r_ctl.rc_prr_inc_var; + break; + case TCP_RACK_IDLE_REDUCE_HIGH: + optval = rack->r_idle_reduce_largest; + break; + case TCP_RACK_MIN_PACE: + optval = rack->r_enforce_min_pace; + break; + case TCP_RACK_MIN_PACE_SEG: + optval = rack->r_min_pace_seg_thresh; + break; + case TCP_BBR_RACK_RTT_USE: + optval = rack->r_ctl.rc_rate_sample_method; + break; + case TCP_DELACK: + optval = tp->t_delayed_ack; + break; + case TCP_DATA_AFTER_CLOSE: + optval = rack->rc_allow_data_af_clo; + break; + default: + return (tcp_default_ctloutput(so, sopt, inp, tp)); + break; + } + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + return (error); +} + +static int +rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) +{ + int32_t error = EINVAL; + struct tcp_rack *rack; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (rack == NULL) { + /* Huh? 
*/ + goto out; + } + if (sopt->sopt_dir == SOPT_SET) { + return (rack_set_sockopt(so, sopt, inp, tp, rack)); + } else if (sopt->sopt_dir == SOPT_GET) { + return (rack_get_sockopt(so, sopt, inp, tp, rack)); + } +out: + INP_WUNLOCK(inp); + return (error); +} + + +struct tcp_function_block __tcp_rack = { + .tfb_tcp_block_name = __XSTRING(STACKNAME), + .tfb_tcp_output = rack_output, + .tfb_tcp_do_segment = rack_do_segment, + .tfb_tcp_hpts_do_segment = rack_hpts_do_segment, + .tfb_tcp_ctloutput = rack_ctloutput, + .tfb_tcp_fb_init = rack_init, + .tfb_tcp_fb_fini = rack_fini, + .tfb_tcp_timer_stop_all = rack_stopall, + .tfb_tcp_timer_activate = rack_timer_activate, + .tfb_tcp_timer_active = rack_timer_active, + .tfb_tcp_timer_stop = rack_timer_stop, + .tfb_tcp_rexmit_tmr = rack_remxt_tmr, + .tfb_tcp_handoff_ok = rack_handoff_ok +}; + +static const char *rack_stack_names[] = { + __XSTRING(STACKNAME), +#ifdef STACKALIAS + __XSTRING(STACKALIAS), +#endif +}; + +static int +rack_ctor(void *mem, int32_t size, void *arg, int32_t how) +{ + memset(mem, 0, size); + return (0); +} + +static void +rack_dtor(void *mem, int32_t size, void *arg) +{ + +} + +static bool rack_mod_inited = false; + +static int +tcp_addrack(module_t mod, int32_t type, void *data) +{ + int32_t err = 0; + int num_stacks; + + switch (type) { + case MOD_LOAD: + rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", + sizeof(struct rack_sendmap), + rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); + + rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", + sizeof(struct tcp_rack), + rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); + + sysctl_ctx_init(&rack_sysctl_ctx); + rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_net_inet_tcp), + OID_AUTO, + __XSTRING(STACKNAME), + CTLFLAG_RW, 0, + ""); + if (rack_sysctl_root == NULL) { + printf("Failed to add sysctl node\n"); + err = EFAULT; + goto free_uma; + } + rack_init_sysctls(); + num_stacks = nitems(rack_stack_names); + err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, + rack_stack_names, &num_stacks); + if (err) { + printf("Failed to register %s stack name for " + "%s module\n", rack_stack_names[num_stacks], + __XSTRING(MODNAME)); + sysctl_ctx_free(&rack_sysctl_ctx); +free_uma: + uma_zdestroy(rack_zone); + uma_zdestroy(rack_pcb_zone); + rack_counter_destroy(); + printf("Failed to register rack module -- err:%d\n", err); + return (err); + } + rack_mod_inited = true; + break; + case MOD_QUIESCE: + err = deregister_tcp_functions(&__tcp_rack, true, false); + break; + case MOD_UNLOAD: + err = deregister_tcp_functions(&__tcp_rack, false, true); + if (err == EBUSY) + break; + if (rack_mod_inited) { + uma_zdestroy(rack_zone); + uma_zdestroy(rack_pcb_zone); + sysctl_ctx_free(&rack_sysctl_ctx); + rack_counter_destroy(); + rack_mod_inited = false; + } + err = 0; + break; + default: + return (EOPNOTSUPP); + } + return (err); +} + +static moduledata_t tcp_rack = { + .name = __XSTRING(MODNAME), + .evhand = tcp_addrack, + .priv = 0 +}; + +MODULE_VERSION(MODNAME, 1); +DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); Property changes on: head/sys/netinet/tcp_stacks/rack.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/netinet/tcp_stacks/rack_bbr_common.h 
=================================================================== --- head/sys/netinet/tcp_stacks/rack_bbr_common.h (nonexistent) +++ head/sys/netinet/tcp_stacks/rack_bbr_common.h (revision 334804) @@ -0,0 +1,70 @@ +#ifndef __pacer_timer_h__ +#define __pacer_timer_h__ +/*- + * Copyright (c) 2017 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * __FBSDID("$FreeBSD$"); + */ +/* Common defines and such used by both RACK and BBR */ +/* Special values for mss accounting array */ +#define TCP_MSS_ACCT_JUSTRET 0 +#define TCP_MSS_ACCT_SNDACK 1 +#define TCP_MSS_ACCT_PERSIST 2 +#define TCP_MSS_ACCT_ATIMER 60 +#define TCP_MSS_ACCT_INPACE 61 +#define TCP_MSS_ACCT_LATE 62 +#define TCP_MSS_SMALL_SIZE_OFF 63 /* Point where small sizes enter */ +#define TCP_MSS_ACCT_SIZE 70 +#define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF) + + +/* Magic flags to tell whats cooking on the pacing wheel */ +#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */ +#define PACE_TMR_RACK 0x02 /* RACK timer running */ +#define PACE_TMR_TLP 0x04 /* TLP timer running */ +#define PACE_TMR_RXT 0x08 /* Retransmit timer running */ +#define PACE_TMR_PERSIT 0x10 /* Persists timer running */ +#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ +#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */ +#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) + +/* Magic flags for tracing progress events */ +#define PROGRESS_DROP 1 +#define PROGRESS_UPDATE 2 +#define PROGRESS_CLEAR 3 +#define PROGRESS_START 4 + + +/* RTT sample methods */ +#define USE_RTT_HIGH 0 +#define USE_RTT_LOW 1 +#define USE_RTT_AVG 2 + +#ifdef _KERNEL +/* We have only 7 bits in rack so assert its true */ +CTASSERT((PACE_TMR_MASK & 0x80) == 0); +#endif +#endif Property changes on: head/sys/netinet/tcp_stacks/rack_bbr_common.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/netinet/tcp_stacks/sack_filter.c 
=================================================================== --- head/sys/netinet/tcp_stacks/sack_filter.c (nonexistent) +++ head/sys/netinet/tcp_stacks/sack_filter.c (revision 334804) @@ -0,0 +1,706 @@ +/*- + * Copyright (c) 2017 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#endif +#include "sack_filter.h" + +/* + * Sack filter is used to filter out sacks + * that have already been processed. The idea + * is pretty simple really, consider two sacks + * + * SACK 1 + * cum-ack A + * sack B - C + * SACK 2 + * cum-ack A + * sack D - E + * sack B - C + * + * The previous sack information (B-C) is repeated + * in SACK 2. If the receiver gets SACK 1 and then + * SACK 2 then any work associated with B-C as already + * been completed. This only effects where we may have + * (as in bbr or rack) cases where we walk a linked list. + * + * Now the utility trys to keep everything in a single + * cache line. This means that its not perfect and + * it could be that so big of sack's come that a + * "remembered" processed sack falls off the list and + * so gets re-processed. Thats ok, it just means we + * did some extra work. We could of course take more + * cache line hits by expanding the size of this + * structure, but then that would cost more. + */ + +#ifndef _KERNEL +int detailed_dump = 0; +uint64_t cnt_skipped_oldsack = 0; +uint64_t cnt_used_oldsack = 0; +int highest_used=0; +int over_written=0; +int empty_avail=0; +int no_collapse = 0; +FILE *out = NULL; +FILE *in = NULL; +#endif + +#define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits) +#define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits) +#define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits) + +#ifndef _KERNEL +static +#endif +void +sack_filter_clear(struct sack_filter *sf, tcp_seq seq) +{ + sf->sf_ack = seq; + sf->sf_bits = 0; + sf->sf_cur = 0; + sf->sf_used = 0; +} +/* + * Given a previous sack filter block, filter out + * any entries where the cum-ack moves over them + * fully or partially. 
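The filter above tracks which of its slots hold a valid SACK block with the sf_bits mask and the three macros just defined. A standalone illustration of that bookkeeping follows; the slot count and the reduced struct here are placeholders, not the real struct sack_filter from sack_filter.h:

#include <stdint.h>

#define SACK_FILTER_SLOTS 8             /* placeholder; the real count is in sack_filter.h */

struct sack_filter_bits {
    uint32_t sf_bits;                   /* one bit per slot */
    int sf_used;                        /* population count of sf_bits */
};

#define slot_used(sf, i)  ((1 << (i)) & (sf)->sf_bits)
#define slot_set(sf, i)   ((1 << (i)) | (sf)->sf_bits)
#define slot_clr(sf, i)   (~(1 << (i)) & (sf)->sf_bits)

static void
slot_take(struct sack_filter_bits *sf, int i)
{
    if (!slot_used(sf, i)) {
        sf->sf_bits = slot_set(sf, i);  /* mark slot i valid */
        sf->sf_used++;
    }
}

static void
slot_release(struct sack_filter_bits *sf, int i)
{
    if (slot_used(sf, i)) {
        sf->sf_bits = slot_clr(sf, i);  /* slot i is free again */
        sf->sf_used--;
    }
}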
+ */ +static void +sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack) +{ + int32_t i; + /* start with the oldest */ + for (i = 0; i < SACK_FILTER_BLOCKS; i++) { + if (sack_blk_used(sf, i)) { + if (SEQ_GT(th_ack, sf->sf_blks[i].end)) { + /* This block is consumed */ + sf->sf_bits = sack_blk_clr(sf, i); + sf->sf_used--; + } else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) { + /* Some of it is acked */ + sf->sf_blks[i].start = th_ack; + /* We could in theory break here, but + * there are some broken implementations + * that send multiple blocks. We want + * to catch them all with similar seq's. + */ + } + } + } + sf->sf_ack = th_ack; +} + +/* + * Return true if you find that + * the sackblock b is on the score + * board. Update it along the way + * if part of it is on the board. + */ +static int32_t +is_sack_on_board(struct sack_filter *sf, struct sackblk *b) +{ + int32_t i, cnt; + for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) { + if (sack_blk_used(sf, i)) { + if (SEQ_LT(b->start, sf->sf_ack)) { + /* Behind cum-ack update */ + b->start = sf->sf_ack; + } + if (SEQ_LT(b->end, sf->sf_ack)) { + /* End back behind too */ + b->end = sf->sf_ack; + } + if (b->start == b->end) + return(1); + /* Jonathans Rule 1 */ + if (SEQ_LEQ(sf->sf_blks[i].start, b->start) && + SEQ_GEQ(sf->sf_blks[i].end, b->end)) { + /** + * Our board has this entirely in + * whole or in part: + * + * board |-------------| + * sack |-------------| + * + * board |-------------| + * sack |----| + * + */ + return(1); + } + /* Jonathans Rule 2 */ + if(SEQ_LT(sf->sf_blks[i].end, b->start)) { + /** + * Not near each other: + * + * board |---| + * sack |---| + */ + goto nxt_blk; + } + /* Jonathans Rule 3 */ + if (SEQ_GT(sf->sf_blks[i].start, b->end)) { + /** + * Not near each other: + * + * board |---| + * sack |---| + */ + goto nxt_blk; + } + if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) { + /** + * The board block partial meets: + * + * board |--------| + * sack |----------| + * + * board |--------| + * sack |--------------| + * + * up with this one (we have part of it). + * 1) Update the board block to the new end + * and + * 2) Update the start of this block to my end. + */ + b->start = sf->sf_blks[i].end; + sf->sf_blks[i].end = b->end; + goto nxt_blk; + } + if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) { + /** + * The board block partial meets: + * + * board |--------| + * sack |----------| + * + * board |----| + * sack |----------| + * 1) Update the board block to the new start + * and + * 2) Update the start of this block to my end. + */ + b->end = sf->sf_blks[i].start; + sf->sf_blks[i].start = b->start; + goto nxt_blk; + } + } + nxt_blk: + i++; + i %= SACK_FILTER_BLOCKS; + } + /* Did we totally consume it in pieces? */ + if (b->start != b->end) + return(0); + else + return(1); +} + +static int32_t +sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks) +{ + int32_t num, i; + struct sackblk blkboard[TCP_MAX_SACK]; + /* + * An old sack has arrived. It may contain data + * we do not have. We might not have it since + * we could have had a lost ack we might have the + * entire thing on our current board. We want to prune + * off anything we have. With this function though we + * won't add to the board. 
+ */ + for( i = 0, num = 0; isf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk)); + sf->sf_bits = sack_blk_clr(sf, idx); + sf->sf_bits = sack_blk_set(sf, i); + return; + } + i++; + i %= SACK_FILTER_BLOCKS; + } +} + +static int32_t +sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) +{ + struct sackblk blkboard[TCP_MAX_SACK]; + int32_t num, i; + /* + * First lets trim the old and possibly + * throw any away we have. + */ + for(i=0, num=0; i=0; i--) { + if (is_sack_on_board(sf, &blkboard[i])) + continue; + /* Add this guy its not listed */ + sf->sf_cur++; + sf->sf_cur %= SACK_FILTER_BLOCKS; + if ((sack_blk_used(sf, sf->sf_cur)) && + (sf->sf_used < SACK_FILTER_BLOCKS)) { + sack_move_to_empty(sf, sf->sf_cur); + } +#ifndef _KERNEL + if (sack_blk_used(sf, sf->sf_cur)) { + over_written++; + if (sf->sf_used < SACK_FILTER_BLOCKS) + empty_avail++; + } +#endif + memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); + if (sack_blk_used(sf, sf->sf_cur) == 0) { + sf->sf_used++; +#ifndef _KERNEL + if (sf->sf_used > highest_used) + highest_used = sf->sf_used; +#endif + sf->sf_bits = sack_blk_set(sf, sf->sf_cur); + } + } + return(numblks); +} + +/* + * Given a sack block on the board (the skip index) see if + * any other used entries overlap or meet, if so return the index. + */ +static int32_t +sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip) +{ + int32_t i; + + for(i=0; isf_blks[i].end, sb->start) && + SEQ_LEQ(sf->sf_blks[i].end, sb->end) && + SEQ_LEQ(sf->sf_blks[i].start, sb->start)) { + /** + * The two board blocks meet: + * + * board1 |--------| + * board2 |----------| + * + * board1 |--------| + * board2 |--------------| + * + * board1 |--------| + * board2 |--------| + */ + return(i); + } + if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) && + SEQ_GEQ(sf->sf_blks[i].start, sb->start) && + SEQ_GEQ(sf->sf_blks[i].end, sb->end)) { + /** + * The board block partial meets: + * + * board |--------| + * sack |----------| + * + * board |----| + * sack |----------| + * 1) Update the board block to the new start + * and + * 2) Update the start of this block to my end. + */ + return(i); + } + } + return (-1); +} + +/* + * Collapse entry src into entry into + * and free up the src entry afterwards. + */ +static void +sack_collapse(struct sack_filter *sf, int32_t src, int32_t into) +{ + if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) { + /* src has a lower starting point */ + sf->sf_blks[into].start = sf->sf_blks[src].start; + } + if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) { + /* src has a higher ending point */ + sf->sf_blks[into].end = sf->sf_blks[src].end; + } + sf->sf_bits = sack_blk_clr(sf, src); + sf->sf_used--; +} + +static void +sack_board_collapse(struct sack_filter *sf) +{ + int32_t i, j, i_d, j_d; + + for(i=0; isf_blks[i], i); + if (j == -1) { + /* No overlap */ + continue; + } + /* + * Ok j and i overlap with each other, collapse the + * one out furthest away from the current position. 
+ */ + if (sf->sf_cur > i) + i_d = sf->sf_cur - i; + else + i_d = i - sf->sf_cur; + if (sf->sf_cur > j) + j_d = sf->sf_cur - j; + else + j_d = j - sf->sf_cur; + if (j_d > i_d) { + sack_collapse(sf, j, i); + } else + sack_collapse(sf, i, j); + } +} + +#ifndef _KERNEL +static +#endif +int +sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) +{ + int32_t i, ret; + + if (numblks > TCP_MAX_SACK) { + panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n", + sf, in, + numblks); + return(numblks); + } + if ((sf->sf_used == 0) && numblks) { + /* + * We are brand new add the blocks in + * reverse order. Note we can see more + * than one in new, since ack's could be lost. + */ + sf->sf_ack = th_ack; + for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) { + memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); + sf->sf_bits = sack_blk_set(sf, sf->sf_cur); + sf->sf_cur++; + sf->sf_cur %= SACK_FILTER_BLOCKS; + sf->sf_used++; +#ifndef _KERNEL + if (sf->sf_used > highest_used) + highest_used = sf->sf_used; +#endif + } + if (sf->sf_cur) + sf->sf_cur--; + return(numblks); + } + if (SEQ_GT(th_ack, sf->sf_ack)) { + sack_filter_prune(sf, th_ack); + } + if (numblks) { + if (SEQ_GEQ(th_ack, sf->sf_ack)) { + ret = sack_filter_new(sf, in, numblks, th_ack); + } else { + ret = sack_filter_old(sf, in, numblks); + } + } else + ret = 0; +#ifndef _KERNEL + if ((sf->sf_used > 1) && (no_collapse == 0)) + sack_board_collapse(sf); + +#else + if (sf->sf_used > 1) + sack_board_collapse(sf); + +#endif + return (ret); +} + +#ifndef _KERNEL +uint64_t saved=0; +uint64_t tot_sack_blks=0; + +static void +sack_filter_dump(FILE *out, struct sack_filter *sf) +{ + int i; + fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n", + sf->sf_ack, sf->sf_bits, + sf->sf_cur, sf->sf_used); + + for(i=0; isf_blks[i].start, + sf->sf_blks[i].end); + } + } +} + +int +main(int argc, char **argv) +{ + char buffer[512]; + struct sackblk blks[TCP_MAX_SACK]; + FILE *err; + tcp_seq th_ack, snd_una; + struct sack_filter sf; + int32_t numblks,i; + int snd_una_set=0; + double a, b, c; + int invalid_sack_print = 0; + uint32_t chg_remembered=0; + uint32_t sack_chg=0; + char line_buf[10][256]; + int line_buf_at=0; + + in = stdin; + out = stdout; + while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) { + switch (i) { + case 'n': + no_collapse = 1; + break; + case 'd': + detailed_dump = 1; + break; + case'I': + invalid_sack_print = 1; + break; + case 'i': + in = fopen(optarg, "r"); + if (in == NULL) { + fprintf(stderr, "Fatal error can't open %s for input\n", optarg); + exit(-1); + } + break; + case 'o': + out = fopen(optarg, "w"); + if (out == NULL) { + fprintf(stderr, "Fatal error can't open %s for output\n", optarg); + exit(-1); + } + break; + default: + case '?': + case 'h': + fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]); + return(0); + break; + }; + } + sack_filter_clear(&sf, 0); + memset(buffer, 0, sizeof(buffer)); + memset(blks, 0, sizeof(blks)); + numblks = 0; + fprintf(out, "************************************\n"); + while (fgets(buffer, sizeof(buffer), in) != NULL) { + sprintf(line_buf[line_buf_at], "%s", buffer); + line_buf_at++; + if (strncmp(buffer, "QUIT", 4) == 0) { + break; + } else if (strncmp(buffer, "DONE", 4) == 0) { + int nn, ii; + if (numblks) { + uint32_t szof, tot_chg; + for(ii=0; ii chg_remembered)){ + fprintf(out,"***WARNING WILL RODGERS DANGER!! 
sack_chg:%u last:%u\n", + sack_chg, chg_remembered + ); + } + sack_chg = chg_remembered = 0; + } else if (strncmp(buffer, "RXT", 3) == 0) { + sack_filter_clear(&sf, snd_una); + } else if (strncmp(buffer, "ACK:", 4) == 0) { + th_ack = strtoul(&buffer[4], NULL, 0); + if (snd_una_set == 0) { + snd_una = th_ack; + snd_una_set = 1; + } else if (SEQ_GT(th_ack, snd_una)) { + snd_una = th_ack; + } + } else if (strncmp(buffer, "EXIT", 4) == 0) { + sack_filter_clear(&sf, snd_una); + sack_chg = chg_remembered = 0; + } else if (strncmp(buffer, "SACK:", 5) == 0) { + char *end=NULL; + uint32_t start; + uint32_t endv; + start = strtoul(&buffer[5], &end, 0); + if (end) { + endv = strtoul(&end[1], NULL, 0); + } else { + fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start); + continue; + } + if (SEQ_LT(endv, start)) { + fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start); + continue; + } + if (numblks == TCP_MAX_SACK) { + fprintf(out, "--Exceeded max %d\n", numblks); + exit(0); + } + blks[numblks].start = start; + blks[numblks].end = endv; + numblks++; + } + memset(buffer, 0, sizeof(buffer)); + } + if (in != stdin) { + fclose(in); + } + if (out != stdout) { + fclose(out); + } + a = saved * 100.0; + b = tot_sack_blks * 1.0; + if (b > 0.0) + c = a/b; + else + c = 0.0; + if (out != stdout) + err = stdout; + else + err = stderr; + fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n", + saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail); + return(0); +} +#endif Property changes on: head/sys/netinet/tcp_stacks/sack_filter.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/netinet/tcp_stacks/sack_filter.h =================================================================== --- head/sys/netinet/tcp_stacks/sack_filter.h (nonexistent) +++ head/sys/netinet/tcp_stacks/sack_filter.h (revision 334804) @@ -0,0 +1,58 @@ +#ifndef __sack_filter_h__ +#define __sack_filter_h__ +/*- + * Copyright (c) 2017 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * __FBSDID("$FreeBSD$"); + */ + +/* + * Seven entry's is carefully choosen to + * fit in one cache line. We can easily + * change this to 15 (but it gets very + * little extra filtering). To change it + * to be larger than 15 would require either + * sf_bits becoming a uint32_t and then you + * could go to 31.. or change it to a full + * bitstring.. It is really doubtful you + * will get much benefit beyond 7, in testing + * there was a small amount but very very small. + */ +#define SACK_FILTER_BLOCKS 7 + +struct sack_filter { + tcp_seq sf_ack; + uint16_t sf_bits; + uint8_t sf_cur; + uint8_t sf_used; + struct sackblk sf_blks[SACK_FILTER_BLOCKS]; +}; +#ifdef _KERNEL +void sack_filter_clear(struct sack_filter *sf, tcp_seq seq); +int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack); + +#endif +#endif Property changes on: head/sys/netinet/tcp_stacks/sack_filter.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/netinet/tcp_stacks/tcp_rack.h =================================================================== --- head/sys/netinet/tcp_stacks/tcp_rack.h (nonexistent) +++ head/sys/netinet/tcp_stacks/tcp_rack.h (revision 334804) @@ -0,0 +1,321 @@ +/*- + * Copyright (c) 2016 + * Netflix Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_RACK_H_ +#define _NETINET_TCP_RACK_H_ + +#define RACK_ACKED 0x0001/* The remote endpoint acked this */ +#define RACK_TO_MIXED 0x0002/* A timeout occured that mixed the send order */ +#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc */ +#define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */ +#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */ +#define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */ +#define RACK_HAS_FIN 0x0040/* segment is sent with fin */ +#define RACK_TLP 0x0080/* segment sent as tail-loss-probe */ + +#define RACK_NUM_OF_RETRANS 3 + +#define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */ + +struct rack_sendmap { + TAILQ_ENTRY(rack_sendmap) r_next; /* seq number arrayed next */ + TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */ + uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; + uint32_t r_start; /* Sequence number of the segment */ + uint32_t r_end; /* End seq, this is 1 beyond actually */ + uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ + uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time + * sent */ + uint8_t r_flags; /* Flags as defined above */ + uint8_t r_sndcnt; /* Retran count, not limited by + * RACK_NUM_OF_RETRANS */ + uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */ + uint8_t r_resv[3]; +}; + +TAILQ_HEAD(rack_head, rack_sendmap); + + +/* + * We use the rate sample structure to + * assist in single sack/ack rate and rtt + * calculation. In the future we will expand + * this in BBR to do forward rate sample + * b/w estimation. + */ +#define RACK_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTT's */ +#define RACK_RTT_VALID 0x00000002 /* We have at least one valid RTT */ +struct rack_rtt_sample { + uint32_t rs_flags; + uint32_t rs_rtt_lowest; + uint32_t rs_rtt_highest; + uint32_t rs_rtt_cnt; + uint64_t rs_rtt_tot; +}; + +#define RACK_LOG_TYPE_ACK 0x01 +#define RACK_LOG_TYPE_OUT 0x02 +#define RACK_LOG_TYPE_TO 0x03 +#define RACK_LOG_TYPE_ALLOC 0x04 +#define RACK_LOG_TYPE_FREE 0x05 + + +struct rack_log { + union { + struct rack_sendmap *rsm; /* For alloc/free */ + uint64_t sb_acc;/* For out/ack or t-o */ + }; + uint32_t th_seq; + uint32_t th_ack; + uint32_t snd_una; + uint32_t snd_nxt; /* th_win for TYPE_ACK */ + uint32_t snd_max; + uint32_t blk_start[4]; + uint32_t blk_end[4]; + uint8_t type; + uint8_t n_sackblks; + uint16_t len; /* Timeout T3=1, TLP=2, RACK=3 */ +}; + +/* + * Magic numbers for logging timeout events if the + * logging is enabled. 
+ */ +#define RACK_TO_FRM_TMR 1 +#define RACK_TO_FRM_TLP 2 +#define RACK_TO_FRM_RACK 3 +#define RACK_TO_FRM_KEEP 4 +#define RACK_TO_FRM_PERSIST 5 +#define RACK_TO_FRM_DELACK 6 + +struct rack_opts_stats { + uint64_t tcp_rack_prop_rate; + uint64_t tcp_rack_prop; + uint64_t tcp_rack_tlp_reduce; + uint64_t tcp_rack_early_recov; + uint64_t tcp_rack_pace_always; + uint64_t tcp_rack_pace_reduce; + uint64_t tcp_rack_max_seg; + uint64_t tcp_rack_prr_sendalot; + uint64_t tcp_rack_min_to; + uint64_t tcp_rack_early_seg; + uint64_t tcp_rack_reord_thresh; + uint64_t tcp_rack_reord_fade; + uint64_t tcp_rack_tlp_thresh; + uint64_t tcp_rack_pkt_delay; + uint64_t tcp_rack_tlp_inc_var; + uint64_t tcp_tlp_use; + uint64_t tcp_rack_idle_reduce; + uint64_t tcp_rack_idle_reduce_high; + uint64_t rack_no_timer_in_hpts; + uint64_t tcp_rack_min_pace_seg; + uint64_t tcp_rack_min_pace; +}; + +#define TLP_USE_ID 1 /* Internet draft behavior */ +#define TLP_USE_TWO_ONE 2 /* Use 2.1 behavior */ +#define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */ + +#ifdef _KERNEL +#define RACK_OPTS_SIZE (sizeof(struct rack_opts_stats)/sizeof(uint64_t)) +extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; +#define RACK_OPTS_ADD(name, amm) counter_u64_add(rack_opts_arry[(offsetof(struct rack_opts_stats, name)/sizeof(uint64_t))], (amm)) +#define RACK_OPTS_INC(name) RACK_OPTS_ADD(name, 1) +#endif +/* + * As we get each SACK we wade through the + * rc_map and mark off what is acked. + * We also increment rc_sacked as well. + * + * We also pay attention to missing entries + * based on the time and possibly mark them + * for retransmit. If we do and we are not already + * in recovery we enter recovery. In doing + * so we claer prr_delivered/holes_rxt and prr_sent_dur_rec. + * We also setup rc_next/rc_snd_nxt/rc_send_end so + * we will know where to send from. When not in + * recovery rc_next will be NULL and rc_snd_nxt should + * equal snd_max. + * + * Whenever we retransmit from recovery we increment + * rc_holes_rxt as we retran a block and mark it as retransmitted + * with the time it was sent. During non-recovery sending we + * add to our map and note the time down of any send expanding + * the rc_map at the tail and moving rc_snd_nxt up with snd_max. + * + * In recovery during SACK/ACK processing if a chunk has + * been retransmitted and it is now acked, we decrement rc_holes_rxt. + * When we retransmit from the scoreboard we use + * rc_next and rc_snd_nxt/rc_send_end to help us + * find what needs to be retran. + * + * To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt + * This gets us the effect of RFC6675 pipe, counting twice for + * bytes retransmitted. + */ + +#define TT_RACK_FR_TMR 0x2000 + +/* + * Locking for the rack control block. 
+ * a) Locked by INP_WLOCK + * b) Locked by the hpts-mutex + * + */ + +struct rack_control { + /* Second cache line 0x40 from tcp_rack */ + struct rack_head rc_map;/* List of all segments Lock(a) */ + struct rack_head rc_tmap; /* List in transmit order Lock(a) */ + struct rack_sendmap *rc_tlpsend; /* Remembered place for + * tlp_sending Lock(a) */ + struct rack_sendmap *rc_resend; /* something we have been asked to + * resend */ + uint32_t rc_hpts_flags; + uint32_t rc_timer_exp; /* If a timer ticks of expiry */ + uint32_t rc_rack_min_rtt; /* lowest RTT seen Lock(a) */ + uint32_t rc_rack_largest_cwnd; /* Largest CWND we have seen Lock(a) */ + + /* Third Cache line 0x80 */ + struct rack_head rc_free; /* Allocation array */ + uint32_t rc_time_last_sent; /* Time we last sent some data and + * logged it Lock(a). */ + uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */ + + uint32_t rc_tlp_new_data; /* we need to send new-data on a TLP + * Lock(a) */ + uint32_t rc_prr_out; /* bytes sent during recovery Lock(a) */ + + uint32_t rc_prr_recovery_fs; /* recovery fs point Lock(a) */ + + uint32_t rc_prr_sndcnt; /* Prr sndcnt Lock(a) */ + + uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */ + uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */ + + uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */ + uint16_t rc_tlp_send_cnt; /* Number of TLP sends we have done + * since peer spoke to us Lock(a) */ + uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent + * rc_last_tlp_seq Lock(a) */ + + uint32_t rc_loss_count; /* During recovery how many segments were lost + * Lock(a) */ + uint32_t rc_reorder_fade; /* Socket option value Lock(a) */ + + /* Forth cache line 0xc0 */ + /* Times */ + + uint32_t rc_rack_tmit_time; /* Rack transmit time Lock(a) */ + uint32_t rc_holes_rxt; /* Tot retraned from scoreboard Lock(a) */ + + /* Variables to track bad retransmits and recover */ + uint32_t rc_rsm_start; /* RSM seq number we retransmitted Lock(a) */ + uint32_t rc_cwnd_at; /* cwnd at the retransmit Lock(a) */ + + uint32_t rc_ssthresh_at;/* ssthresh at the retransmit Lock(a) */ + uint32_t rc_num_maps_alloced; /* Number of map blocks (sacks) we + * have allocated */ + uint32_t rc_rcvtime; /* When we last received data */ + uint32_t rc_notused; + uint32_t rc_last_output_to; + uint32_t rc_went_idle_time; + + struct rack_sendmap *rc_sacklast; /* sack remembered place + * Lock(a) */ + + struct rack_sendmap *rc_next; /* remembered place where we next + * retransmit at Lock(a) */ + struct rack_sendmap *rc_rsm_at_retran; /* Debug variable kept for + * cache line alignment + * Lock(a) */ + /* Cache line split 0x100 */ + struct sack_filter rack_sf; + /* Cache line split 0x140 */ + /* Flags for various things */ + struct rack_rtt_sample rack_rs; + uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */ + uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ + uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ + uint16_t rc_pkt_delay; /* Socket option value Lock(a) */ + uint8_t rc_prop_rate; /* Socket option value Lock(a) */ + uint8_t rc_prop_reduce; /* Socket option value Lock(a) */ + uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */ + uint8_t rc_early_recovery; /* Socket option value Lock(a) */ + uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */ + uint8_t rc_min_to; /* Socket option value Lock(a) */ + uint8_t rc_prr_inc_var; /* Socket option value Lock(a) */ + uint8_t rc_tlp_rtx_out; /* This is TLPRtxOut in the draft */ + uint8_t 
rc_rate_sample_method; +}; + +#ifdef _KERNEL + +struct tcp_rack { + /* First cache line 0x00 */ + TAILQ_ENTRY(tcp_rack) r_hpts; /* hptsi queue next Lock(b) */ + int32_t(*r_substate) (struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, struct tcpopt *, + int32_t, int32_t, int32_t *, uint32_t, int, int); /* Lock(a) */ + struct tcpcb *rc_tp; /* The tcpcb Lock(a) */ + struct inpcb *rc_inp; /* The inpcb Lock(a) */ + uint32_t rc_free_cnt; /* Number of free entries on the rc_free list + * Lock(a) */ + uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */ + uint16_t r_wanted_output; /* Output routine wanted to be called */ + uint16_t r_cpu; /* CPU that the INP is running on Lock(a) */ + uint16_t rc_pace_max_segs; /* Socket option value Lock(a) */ + uint16_t rc_pace_reduce;/* Socket option value Lock(a) */ + + uint8_t r_state; /* Current rack state Lock(a) */ + uint8_t rc_tmr_stopped : 7, + t_timers_stopped : 1; + uint8_t rc_enobuf; /* count of enobufs on connection provides + * backoff Lock(a) */ + uint8_t r_timer_override : 1, /* hpts override Lock(a) */ + r_tlp_running : 1, /* Running from a TLP timeout Lock(a) */ + r_is_v6 : 1, /* V6 pcb Lock(a) */ + rc_in_persist : 1, + rc_last_pto_set : 1, /* XXX not used */ + rc_tlp_in_progress : 1, + rc_always_pace : 1, /* Socket option value Lock(a) */ + rc_timer_up : 1; /* The rack timer is up flag Lock(a) */ + uint8_t r_idle_reduce_largest : 1, + r_enforce_min_pace : 2, + r_min_pace_seg_thresh : 5; + uint8_t rack_tlp_threshold_use; + uint8_t rc_allow_data_af_clo: 1, + delayed_ack : 1, + rc_avail : 6; + uint8_t r_resv[2]; /* Fill to cache line boundary */ + /* Cache line 2 0x40 */ + struct rack_control r_ctl; +} __aligned(CACHE_LINE_SIZE); + +#endif +#endif Property changes on: head/sys/netinet/tcp_stacks/tcp_rack.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/netinet/tcp_timer.c =================================================================== --- head/sys/netinet/tcp_timer.c (revision 334803) +++ head/sys/netinet/tcp_timer.c (revision 334804) @@ -1,993 +1,1094 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include +#include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif int tcp_persmin; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); int tcp_persmax; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); int tcp_keepidle; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); int tcp_keepintvl; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); int tcp_delacktime; SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", "Time before a delayed ACK is sent"); int tcp_msl; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); int tcp_rexmit_min; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); int tcp_rexmit_slop; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); int tcp_always_keepalive = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); int tcp_fast_finwait2_recycle = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, &tcp_fast_finwait2_recycle, 0, "Recycle closed FIN_WAIT_2 connections faster"); int tcp_finwait2_timeout; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); int tcp_keepcnt = TCPTV_KEEPCNT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, "Number of keepalive probes to send"); /* max idle probes */ int tcp_maxpersistidle; -static int tcp_rexmit_drop_options = 0; +int tcp_rexmit_drop_options = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); 
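Note that tcp_rexmit_drop_options loses its static qualifier in this hunk (tcp_totbackoff and inp_to_cpuid are treated the same way just below), and matching extern declarations are added to tcp_timer.h later in this revision, so that the new RACK stack can share the base stack's retransmit tuning. The following is a hedged sketch of how an alternate stack might consume the exported knob, mirroring the existing logic in tcp_timer_rexmt(); the function name is invented for illustration and the usual netinet headers (tcp_var.h, tcp_timer.h) are assumed.

/*
 * Illustrative consumer in an alternate TCP stack; the function name is
 * hypothetical and the standard kernel includes are assumed.
 */
extern int tcp_rexmit_drop_options;     /* now exported by tcp_timer.c */

static void
example_syn_rexmit_policy(struct tcpcb *tp)
{
        /*
         * Mirror the base stack: after the third retransmitted SYN,
         * optionally strip RFC1323 and SACK options to work around
         * broken middleboxes, exactly as tcp_timer_rexmt() does.
         */
        if (tcp_rexmit_drop_options && tp->t_state == TCPS_SYN_SENT &&
            tp->t_rxtshift == 3)
                tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
}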
VNET_DEFINE(int, tcp_pmtud_blackhole_detect); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_detect), 0, "Path MTU Discovery Black Hole Detection Enabled"); #ifdef INET VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_mss), 0, "Path MTU Discovery Black Hole Detection lowered MSS"); #endif #ifdef INET6 VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); #endif #ifdef RSS static int per_cpu_timers = 1; #else static int per_cpu_timers = 0; #endif SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, &per_cpu_timers , 0, "run tcp timers on all cpus"); -#if 0 -#define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ - ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) -#endif - /* * Map the given inp to a CPU id. * * This queries RSS if it's compiled in, else it defaults to the current * CPU ID. */ -static inline int +inline int inp_to_cpuid(struct inpcb *inp) { u_int cpuid; #ifdef RSS if (per_cpu_timers) { cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) return (curcpu); /* XXX */ else return (cpuid); } #else /* Legacy, pre-RSS behaviour */ if (per_cpu_timers) { /* * We don't have a flowid -> cpuid mapping, so cheat and * just map unknown cpuids to curcpu. Not the best, but * apparently better than defaulting to swi 0. */ cpuid = inp->inp_flowid % (mp_maxid + 1); if (! CPU_ABSENT(cpuid)) return (cpuid); return (curcpu); } #endif /* Default for RSS and non-RSS - cpuid 0 */ else { return (0); } } /* * Tcp protocol timeout routine called every 500 ms. * Updates timestamps used for TCP * causes finite state machine actions if timers expire. */ void tcp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); (void) tcp_tw_2msl_scan(0); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; -static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ +int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ /* * TCP timer processing. */ void tcp_timer_delack(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_delack) || !callout_active(&tp->t_timers->tt_delack)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_delack); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); } /* * When a timer wants to remove a TCB it must * hold the INP_INFO_RLOCK(). The timer function * should only have grabbed the INP_WLOCK() when * it entered. To safely switch to holding both the * INP_INFO_RLOCK() and the INP_WLOCK() we must first * grab a reference on the inp, which will hold the inp * so that it can't be removed. 
We then unlock the INP_WLOCK(), * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK() * we proceed again to get the INP_WLOCK() (this preserves proper * lock order). After acquiring the INP_WLOCK we must check if someone * else deleted the pcb i.e. the inp_flags check. * If so we return 1 otherwise we return 0. * * No matter what the tcp_inpinfo_lock_add() function * returns the caller must afterwards call tcp_inpinfo_lock_del() * to drop the locks and reference properly. */ int tcp_inpinfo_lock_add(struct inpcb *inp) { in_pcbref(inp); INP_WUNLOCK(inp); INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { return(1); } return(0); } void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) { INP_INFO_RUNLOCK(&V_tcbinfo); if (inp && (tp == NULL)) { /* * If tcp_close/drop() gets called and tp * returns NULL, then the function dropped * the inp lock, we hold a reference keeping * this around, so we must re-aquire the * INP_WLOCK() in order to proceed with * our dropping the inp reference. */ INP_WLOCK(inp); } if (inp && in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); } void tcp_timer_2msl(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); tcp_free_sackholes(tp); if (callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_WUNLOCK(tp->t_inpcb); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle * too long delete connection control block. Otherwise, check * again in a bit. * * If in TIME_WAIT state just ignore as this timeout is handled in * tcp_tw_2msl_scan(). * * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. 
*/ if ((inp->inp_flags & INP_TIMEWAIT) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_close(tp); tcp_inpinfo_lock_del(inp, tp); goto out; } else { if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { callout_reset(&tp->t_timers->tt_2msl, TP_KEEPINTVL(tp), tcp_timer_2msl, tp); } else { if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_close(tp); tcp_inpinfo_lock_del(inp, tp); goto out; } } #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); out: CURVNET_RESTORE(); } void tcp_timer_keep(void *xtp) { struct tcpcb *tp = xtp; struct tcptemp *t_template; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * Because we don't regularly reset the keepalive callout in * the ESTABLISHED state, it may be that we don't actually need * to send a keepalive yet. If that occurs, schedule another * call for the next time the keepalive timer might expire. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { u_int idletime; idletime = ticks - tp->t_rcvtime; if (idletime < TP_KEEPIDLE(tp)) { callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } } /* * Keep-alive timer went off; send something * or drop connection if idle for too long. */ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response * if the peer is up and reachable: * either an ACK if the connection is still alive, * or an RST if the peer has closed the connection * due to timeout or reboot. * Using sequence number tp->snd_una-1 * causes the transmitted zero-length segment * to lie outside the receive window; * by the protocol spec, this requires the * correspondent TCP to respond. 
*/ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), tcp_timer_keep, tp); } else callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), tcp_timer_keep, tp); #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); CURVNET_RESTORE(); return; dropit: TCPSTAT_INC(tcps_keepdrops); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); tcp_inpinfo_lock_del(inp, tp); out: CURVNET_RESTORE(); } void tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * Persistence timer into zero window. * Force a byte to be output, if possible. */ TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_drop(tp, ETIMEDOUT); tcp_inpinfo_lock_del(inp, tp); goto out; } /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. 
*/ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_drop(tp, ETIMEDOUT); tcp_inpinfo_lock_del(inp, tp); goto out; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; (void) tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; #ifdef TCPDEBUG if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); out: CURVNET_RESTORE(); } void tcp_timer_rexmt(void * xtp) { struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); int rexmt; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); tcp_free_sackholes(tp); TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false); if (tp->t_fb->tfb_tcp_rexmit_tmr) { /* The stack has a timer action too. */ (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); } /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; } tp = tcp_drop(tp, ETIMEDOUT); tcp_inpinfo_lock_del(inp, tp); goto out; } if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be * limited to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can * be recovered if this turns out to be a "bad" retransmit. * A retransmit is considered "bad" if an ACK for this * segment is received within RTT/2 interval; the assumption * here is that the ACK was already in flight. See * "On Estimating End-to-End Network Path Properties" by * Allman and Paxson for more details. 
*/ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; if ((tp->t_flags & TF_RCVD_TSTMP) == 0) tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); /* In the event that we've negotiated timestamps * badrxtwin will be set to the value that we set * the retransmitted packet's to_tsval to by tcp_output */ tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple of * packets and process straight to FIN. In that case we won't catch * ESTABLISHED state. */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { #ifdef INET6 int isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, 1448 * -> 1188 -> 524) should be given 2 chances to recover before * further clamping down. 'tp->t_rxtshift % 2 == 0' should * take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: * - Disable Path MTU Discovery (IP "DF" bit). * - Reduce MTU to lower value than what we * negotiated with peer. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; } /* * Reduce the MSS to blackhole value or to the default * in an attempt to retransmit. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif /* * Reset the slow-start flight size * as it may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole and * we restore the previous MSS and blackhole detection * flags. 
* The limit '6' is determined by giving each probe * stage (1448, 1188, 524) 2 chances to recover. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift >= 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; TCPSTAT_INC(tcps_pmtud_blackhole_failed); /* * Reset the slow-start flight size as it * may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } } } /* * Disable RFC1323 and SACK if we haven't got any response to * our third SYN to work-around some broken terminal servers * (most of which have hopefully been retired) that have bad VJ * header compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); /* * If we backed off this far, notify the L3 protocol that we're having * connection problems. */ if (tp->t_rxtshift > TCP_RTT_INVALIDATE) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); else #endif in_losing(tp->t_inpcb); } tp->snd_nxt = tp->snd_una; tp->snd_recover = tp->snd_max; /* * Force a segment to be sent. */ tp->t_flags |= TF_ACKNOW; /* * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; cc_cong_signal(tp, NULL, CC_RTO); (void) tp->t_fb->tfb_tcp_output(tp); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); out: CURVNET_RESTORE(); } void tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) { struct callout *t_callout; timeout_t *f_callout; struct inpcb *inp = tp->t_inpcb; int cpu = inp_to_cpuid(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; #endif if (tp->t_timers->tt_flags & TT_STOPPED) return; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_activate) { tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); return; } panic("tp %p bad timer_type %#x", tp, timer_type); } if (delta == 0) { callout_stop(t_callout); } else { callout_reset_on(t_callout, delta, f_callout, tp, cpu); } } int tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_active) { return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); } panic("tp %p bad timer_type %#x", tp, timer_type); } return callout_active(t_callout); +} + +/* + * Stop the timer from running, and apply a flag + * against the timer_flags that will force the + * timer never to run. 
The flag is needed to assure + * a race does not leave it running and cause + * the timer to possibly restart itself (keep and persist + * especially do this). + */ +int +tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type) +{ + struct callout *t_callout; + uint32_t t_flags; + + switch (timer_type) { + case TT_DELACK: + t_flags = TT_DELACK_SUS; + t_callout = &tp->t_timers->tt_delack; + break; + case TT_REXMT: + t_flags = TT_REXMT_SUS; + t_callout = &tp->t_timers->tt_rexmt; + break; + case TT_PERSIST: + t_flags = TT_PERSIST_SUS; + t_callout = &tp->t_timers->tt_persist; + break; + case TT_KEEP: + t_flags = TT_KEEP_SUS; + t_callout = &tp->t_timers->tt_keep; + break; + case TT_2MSL: + t_flags = TT_2MSL_SUS; + t_callout = &tp->t_timers->tt_2msl; + break; + default: + panic("tp:%p bad timer_type 0x%x", tp, timer_type); + } + tp->t_timers->tt_flags |= t_flags; + return (callout_stop(t_callout)); +} + +void +tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type) +{ + switch (timer_type) { + case TT_DELACK: + if (tp->t_timers->tt_flags & TT_DELACK_SUS) { + tp->t_timers->tt_flags &= ~TT_DELACK_SUS; + if (tp->t_flags & TF_DELACK) { + /* Delayed ack timer should be up activate a timer */ + tp->t_flags &= ~TF_DELACK; + tcp_timer_activate(tp, TT_DELACK, + tcp_delacktime); + } + } + break; + case TT_REXMT: + if (tp->t_timers->tt_flags & TT_REXMT_SUS) { + tp->t_timers->tt_flags &= ~TT_REXMT_SUS; + if (SEQ_GT(tp->snd_max, tp->snd_una) && + (tcp_timer_active((tp), TT_PERSIST) == 0) && + tp->snd_wnd) { + /* We have outstanding data activate a timer */ + tcp_timer_activate(tp, TT_REXMT, + tp->t_rxtcur); + } + } + break; + case TT_PERSIST: + if (tp->t_timers->tt_flags & TT_PERSIST_SUS) { + tp->t_timers->tt_flags &= ~TT_PERSIST_SUS; + if (tp->snd_wnd == 0) { + /* Activate the persists timer */ + tp->t_rxtshift = 0; + tcp_setpersist(tp); + } + } + break; + case TT_KEEP: + if (tp->t_timers->tt_flags & TT_KEEP_SUS) { + tp->t_timers->tt_flags &= ~TT_KEEP_SUS; + tcp_timer_activate(tp, TT_KEEP, + TCPS_HAVEESTABLISHED(tp->t_state) ? + TP_KEEPIDLE(tp) : TP_KEEPINIT(tp)); + } + break; + case TT_2MSL: + if (tp->t_timers->tt_flags &= TT_2MSL_SUS) { + tp->t_timers->tt_flags &= ~TT_2MSL_SUS; + if ((tp->t_state == TCPS_FIN_WAIT_2) && + ((tp->t_inpcb->inp_socket == NULL) || + (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE))) { + /* Star the 2MSL timer */ + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle) ? + tcp_finwait2_timeout : TP_MAXIDLE(tp)); + } + } + break; + default: + panic("tp:%p bad timer_type 0x%x", tp, timer_type); + } } void tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; tp->t_timers->tt_flags |= TT_STOPPED; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_stop) { /* * XXXrrs we need to look at this with the * stop case below (flags). */ tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); return; } panic("tp %p bad timer_type %#x", tp, timer_type); } if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { /* * Can't stop the callout, defer tcpcb actual deletion * to the last one. 
We do this using the async drain * function and incrementing the count in */ tp->t_timers->tt_draincnt++; } } Index: head/sys/netinet/tcp_timer.h =================================================================== --- head/sys/netinet/tcp_timer.h (revision 334803) +++ head/sys/netinet/tcp_timer.h (revision 334804) @@ -1,226 +1,232 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_TCP_TIMER_H_ #define _NETINET_TCP_TIMER_H_ /* * The TCPT_REXMT timer is used to force retransmissions. * The TCP has the TCPT_REXMT timer set whenever segments * have been sent for which ACKs are expected but not yet * received. If an ACK is received which advances tp->snd_una, * then the retransmit timer is cleared (if there are no more * outstanding segments) or reset to the base value (if there * are more ACKs expected). Whenever the retransmit timer goes off, * we retransmit one unacknowledged segment, and do a backoff * on the retransmit timer. * * The TCPT_PERSIST timer is used to keep window size information * flowing even if the window goes shut. If all previous transmissions * have been acknowledged (so that there are no retransmissions in progress), * and the window is too small to bother sending anything, then we start * the TCPT_PERSIST timer. When it expires, if the window is nonzero, * we go to transmit state. Otherwise, at intervals send a single byte * into the peer's window to force him to update our window information. * We do this at most as often as TCPT_PERSMIN time intervals, * but no more frequently than the current estimate of round-trip * packet time. The TCPT_PERSIST timer is cleared whenever we receive * a window update from the peer. * * The TCPT_KEEP timer is used to keep connections alive. If an * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, * but not yet established, then we drop the connection. 
Once the connection * is established, if the connection is idle for TCPTV_KEEP_IDLE time * (and keepalives have been enabled on the socket), we begin to probe * the connection. We force the peer to send us a segment by sending: * * This segment is (deliberately) outside the window, and should elicit * an ack segment in response from the peer. If, despite the TCPT_KEEP * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE * amount of time probing, then we drop the connection. */ /* * Time constants. */ #define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */ #define TCPTV_SRTTBASE 0 /* base roundtrip time; if 0, no idea yet */ #define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */ #define TCPTV_PERSMIN ( 5*hz) /* minimum persist interval */ #define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ #define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ #define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ #define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ #define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ /* * Minimum retransmit timer is 3 ticks, for algorithmic stability. * TCPT_RANGESET() will add another TCPTV_CPU_VAR to deal with * the expected worst-case processing variances by the kernels * representing the end points. Such variances do not always show * up in the srtt because the timestamp is often calculated at * the interface rather then at the TCP layer. This value is * typically 50ms. However, it is also possible that delayed * acks (typically 100ms) could create issues so we set the slop * to 200ms to try to cover it. Note that, properly speaking, * delayed-acks should not create a major issue for interactive * environments which 'P'ush the last segment, at least as * long as implementations do the required 'at least one ack * for every two packets' for the non-interactive streaming case. * (maybe the RTO calculation should use 2*RTT instead of RTT * to handle the ack-every-other-packet case). * * The prior minimum of 1*hz (1 second) badly breaks throughput on any * networks faster then a modem that has minor (e.g. 1%) packet loss. */ #define TCPTV_MIN ( hz/33 ) /* minimum allowable value */ #define TCPTV_CPU_VAR ( hz/5 ) /* cpu variance allowed (200ms) */ #define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ #define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ #define TCP_LINGERTIME 120 /* linger at most 2 minutes */ #define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ #define TCPTV_DELACK ( hz/10 ) /* 100ms timeout */ /* * If we exceed this number of retransmits for a single segment, we'll consider * the current srtt measurement no longer valid and will recalculate from * scratch starting with the next ACK. */ #define TCP_RTT_INVALIDATE (TCP_MAXRXTSHIFT / 4) #ifdef TCPTIMERS static const char *tcptimers[] = { "REXMT", "PERSIST", "KEEP", "2MSL", "DELACK" }; #endif /* * Force a time value to be in a certain range. 
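To make the clamping just described concrete, here is an editor's sketch (not part of the header) of how the TCPT_RANGESET() macro defined immediately below is typically used when arming the retransmit timer: the slop is added to the caller's value, which is then bounded by the connection's minimum RTO and TCPTV_REXMTMAX. The helper name and the incoming rexmt value are illustrative; tcp_timer_activate() and TT_REXMT come from the other TCP headers.

/*
 * Editor's sketch, not part of tcp_timer.h: clamp a (possibly backed-off)
 * retransmit value and arm the timer with the result.
 */
static void
example_arm_rexmt(struct tcpcb *tp, int rexmt)
{
	/* rexmt + tcp_rexmit_slop, bounded by [t_rttmin, TCPTV_REXMTMAX]. */
	TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX);
	tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
}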
*/ #define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ (tv) = (value) + tcp_rexmit_slop; \ if ((u_long)(tv) < (u_long)(tvmin)) \ (tv) = (tvmin); \ if ((u_long)(tv) > (u_long)(tvmax)) \ (tv) = (tvmax); \ } while(0) #ifdef _KERNEL struct xtcp_timer; struct tcp_timer { struct callout tt_rexmt; /* retransmit timer */ struct callout tt_persist; /* retransmit persistence */ struct callout tt_keep; /* keepalive */ struct callout tt_2msl; /* 2*msl TIME_WAIT timer */ struct callout tt_delack; /* delayed ACK timer */ uint32_t tt_flags; /* Timers flags */ uint32_t tt_draincnt; /* Count being drained */ }; /* * Flags for the tt_flags field. */ #define TT_DELACK 0x0001 #define TT_REXMT 0x0002 #define TT_PERSIST 0x0004 #define TT_KEEP 0x0008 #define TT_2MSL 0x0010 #define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL) -#define TT_DELACK_RST 0x0100 -#define TT_REXMT_RST 0x0200 -#define TT_PERSIST_RST 0x0400 -#define TT_KEEP_RST 0x0800 -#define TT_2MSL_RST 0x1000 +/* + * Suspend flags - used when suspending a timer + * from ever running again. + */ +#define TT_DELACK_SUS 0x0100 +#define TT_REXMT_SUS 0x0200 +#define TT_PERSIST_SUS 0x0400 +#define TT_KEEP_SUS 0x0800 +#define TT_2MSL_SUS 0x1000 #define TT_STOPPED 0x00010000 #define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit) #define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle) #define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl) #define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt) #define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp)) extern int tcp_persmin; /* minimum persist interval */ extern int tcp_persmax; /* maximum persist interval */ extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ extern int tcp_keepintvl; /* time between keepalive probes */ extern int tcp_keepcnt; /* number of keepalives */ extern int tcp_delacktime; /* time before sending a delayed ACK */ extern int tcp_maxpersistidle; extern int tcp_rexmit_min; extern int tcp_rexmit_slop; extern int tcp_msl; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; extern int tcp_syn_backoff[]; +extern int tcp_totbackoff; +extern int tcp_rexmit_drop_options; extern int tcp_always_keepalive; extern int tcp_finwait2_timeout; extern int tcp_fast_finwait2_recycle; VNET_DECLARE(int, tcp_pmtud_blackhole_detect); #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) VNET_DECLARE(int, tcp_pmtud_blackhole_mss); #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) VNET_DECLARE(int, tcp_v6pmtud_blackhole_mss); #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) int tcp_inpinfo_lock_add(struct inpcb *inp); void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp); void tcp_timer_init(void); void tcp_timer_2msl(void *xtp); void tcp_timer_discard(void *); struct tcptw * tcp_tw_2msl_scan(int reuse); /* XXX temporary? */ void tcp_timer_keep(void *xtp); void tcp_timer_persist(void *xtp); void tcp_timer_rexmt(void *xtp); void tcp_timer_delack(void *xtp); #endif /* _KERNEL */ #endif /* !_NETINET_TCP_TIMER_H_ */ Index: head/sys/netinet/tcp_var.h =================================================================== --- head/sys/netinet/tcp_var.h (revision 334803) +++ head/sys/netinet/tcp_var.h (revision 334804) @@ -1,946 +1,966 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #include #ifdef _KERNEL #include #include #endif #if defined(_KERNEL) || defined(_WANT_TCPCB) /* TCP segment queue entry */ struct tseg_qent { LIST_ENTRY(tseg_qent) tqe_q; int tqe_len; /* TCP segment data length */ struct tcphdr *tqe_th; /* a pointer to tcp header */ struct mbuf *tqe_m; /* mbuf contains packet */ }; LIST_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int ispare; /* explicit pad for 64bit alignment */ int sacked_bytes; /* * Total sacked bytes reported by the * receiver via sack option */ uint32_t _pad1[1]; /* TBD */ uint64_t _pad[1]; /* TBD */ }; STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); /* * Tcp control block, one per tcp; fields: * Organized for 64 byte cacheline efficiency based * on common tcp_input/tcp_output processing. 
*/ struct tcpcb { /* Cache line 1 */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ struct tcp_function_block *t_fb;/* TCP function call block */ void *t_fb_ptr; /* Pointer to t_fb specific data */ uint32_t t_maxseg:24, /* maximum segment size */ t_logstate:8; /* State of "black box" logging */ - uint32_t t_state:4, /* state of this connection */ - bits_spare : 24; + uint32_t t_port:16, /* Tunneling (over udp) port */ + t_state:4, /* state of this connection */ + t_idle_reduce : 1, + t_delayed_ack: 7, /* Delayed ack variable */ + bits_spare : 4; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ - uint32_t cl1_spare; /* Spare to round out CL 1 */ + uint32_t t_peakrate_thr; /* pre-calculated peak rate threshold */ /* Cache line 2 */ u_int32_t ts_offset; /* our timestamp offset */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rcv_numsacks; /* # distinct sack blks present */ u_int t_tsomax; /* TSO total burst length limit in bytes */ u_int t_tsomaxsegcount; /* TSO maximum segment count */ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ u_int t_flags2; /* More tcpcb flags storage */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ u_int32_t ts_recent; /* timestamp echo data */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char snd_limited; /* segments limited transmitted */ u_char request_r_scale; /* pending window scaling */ tcp_seq last_ack_sent; u_int t_rcvtime; /* inactivity time */ /* Cache line 3 */ tcp_seq rcv_up; /* receive urgent pointer */ int t_segqlen; /* segment reassembly queue length */ struct tsegqe_head t_segq; /* segment reassembly queue */ struct mbuf *t_in_pkt; struct mbuf *t_tail_pkt; struct tcp_timer *t_timers; /* All the TCP timers in one struct */ struct vnet *t_vnet; /* back pointer to parent vnet */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ tcp_seq snd_wl1; /* window update seg seq number */ /* Cache line 4 */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq iss; /* initial send sequence number */ u_int t_acktime; u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ uint16_t cl4_spare; /* Spare to adjust CL 4 */ char t_oobflags; /* have some */ char t_iobc; /* input character */ int t_rxtcur; /* current retransmit value (ticks) */ int t_rxtshift; /* log(2) of rexmt exp. 
backoff */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ u_int t_starttime; /* time connection was established */ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ u_int t_rttmin; /* minimum rtt allowed */ u_int t_rttbest; /* best rtt we've seen */ int t_softerror; /* possible error not yet reported */ uint32_t max_sndwnd; /* largest window peer has offered */ /* Cache line 5 */ uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ int t_sndzerowin; /* zero-window updates sent */ u_long t_rttupdated; /* number of times rtt sampled */ int snd_numholes; /* number of holes seen by sender */ u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ tcp_seq sack_newdata; /* New data xmitted in this recovery episode starts at this seq number */ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ int t_bytes_acked; /* # bytes acked during current RTT */ + u_int t_maxunacktime; u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ uint32_t t_logsn; /* Log "serial number" */ uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */ unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */ union { uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN]; uint64_t server; } t_tfo_cookie; /* TCP Fast Open cookie to send */ #ifdef TCPPCAP struct mbufq t_inpkts; /* List of saved input packets. */ struct mbufq t_outpkts; /* List of saved output packets. */ #endif }; #endif /* _KERNEL || _WANT_TCPCB */ #ifdef _KERNEL struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; /* * TODO: We yet need to brave plowing in * to tcp_input() and the pru_usrreq() block. * Right now these go to the old standards which * are somewhat ok, but in the long term may * need to be changed. If we do tackle tcp_input() * then we need to get rid of the tcp_do_segment() * function below. */ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ /* * If defining the optional tcp_timers, in the * tfb_tcp_timer_stop call you must use the * callout_async_drain() function with the * tcp_timer_discard callback. You should check * the return of callout_async_drain() and if 0 * increment tt_draincnt. 
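As an editor's sketch only, the drain-accounting rule just stated might look as follows in a stack that defines its own timers; the per-stack callout lookup is invented for illustration and is not an existing interface.

/*
 * Editor's sketch, not part of this header: a hypothetical alternate
 * stack's tfb_tcp_timer_stop honoring the accounting rule above.
 */
static struct callout *example_stack_callout(struct tcpcb *, uint32_t);
						/* hypothetical helper */

static void
example_tfb_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *c;

	c = example_stack_callout(tp, timer_type);
	if (callout_async_drain(c, tcp_timer_discard) == 0) {
		/*
		 * The callout could not be stopped synchronously;
		 * tcp_timer_discard will run later, so count it.
		 */
		tp->t_timers->tt_draincnt++;
	}
}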
Since the timer sub-system * does not know your callbacks you must provide a * stop_all function that loops through and calls * tcp_timer_stop() with each of your defined timers. * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to * say you can take over and run your stack, you return * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates its transitioning to * another stack (via socket option). */ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, int, struct timeval *); int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); /* Optional memory allocation/free routine */ int (*tfb_tcp_fb_init)(struct tcpcb *); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_timer_activate)(struct tcpcb *, uint32_t, u_int); int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); #endif /* _KERNEL */ /* * Flags and utility macros for the t_flags field. 
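For the tfb_tcp_handoff_ok contract described above (return 0 to accept the switch, an errno value to refuse it), an editor's illustrative implementation could be as simple as the following; the policy chosen here is invented purely for the example.

/*
 * Editor's sketch, not part of this header: refuse a stack switch only
 * for connections already in TIME_WAIT (an arbitrary example policy).
 */
static int
example_handoff_ok(struct tcpcb *tp)
{

	if (tp->t_state == TCPS_TIME_WAIT)
		return (EINVAL);
	return (0);
}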
*/ #define TF_ACKNOW 0x000001 /* ack peer immediately */ #define TF_DELACK 0x000002 /* ack, but try to delay it */ #define TF_NODELAY 0x000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x000008 /* don't use tcp options */ #define TF_SENTFIN 0x000010 /* have sent FIN */ #define TF_REQ_SCALE 0x000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x001000 /* don't push */ #define TF_PREVVALID 0x002000 /* saved values for bad rxmit valid */ #define TF_MORETOCOME 0x010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */ #define TF_LASTIDLE 0x040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x800000 /* force out a byte */ #define TF_TSO 0x1000000 /* TSO enabled on this connection */ #define TF_TOE 0x2000000 /* this connection is offloaded */ #define TF_ECN_PERMIT 0x4000000 /* connection ECN-ready */ #define TF_ECN_SND_CWR 0x8000000 /* ECN CWR in queue */ #define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #if defined(_KERNEL) && !defined(TCP_RFC7413) #define IS_FASTOPEN(t_flags) (false) #else #define IS_FASTOPEN(t_flags) (t_flags & TF_FASTOPEN) #endif #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* * Flags for the extended TCP flags field, t_flags2 */ #define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ #define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */ +#define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions. 
*/ struct tcpopt { u_int32_t to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ #define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ #define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ u_int8_t *to_tfo_cookie; /* pointer to the TFO cookie */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ u_int32_t to_spare; /* UTO */ }; /* * Flags for tcp_dooptions. */ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ uint32_t rmx_mtu; /* MTU for this path */ uint32_t rmx_ssthresh; /* outbound gateway buffer limit */ uint32_t rmx_rtt; /* estimated round trip time */ uint32_t rmx_rttvar; /* estimated rtt variance */ uint32_t rmx_cwnd; /* congestion window */ uint32_t rmx_sendpipe; /* outbound delay-bandwidth product */ uint32_t rmx_recvpipe; /* inbound delay-bandwidth product */ }; /* * Used by tcp_maxmtu() to communicate interface specific features * and limits at the time of connection setup. */ struct tcp_ifcap { int ifcap; u_int tsomax; u_int tsomaxsegcount; u_int tsomaxsegsize; }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ struct tcptw { struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ tcp_seq snd_nxt; tcp_seq rcv_nxt; tcp_seq iss; tcp_seq irs; u_short last_win; /* cached window value */ short tw_so_options; /* copy of so_options */ struct ucred *tw_cred; /* user credentials */ u_int32_t t_recent; u_int32_t ts_offset; /* our timestamp offset */ u_int t_starttime; int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; void *tw_pspare; /* TCP_SIGNATURE */ u_int *tw_spare; /* TCP_SIGNATURE */ }; #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 3 bits to the right of the binary point, * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* * The initial retransmission should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. 
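A worked example of the arithmetic (editor's illustration, with made-up round-trip values) may help: with the scale factors above, a smoothed RTT of 100 ticks is stored as 3200 and a variance of 25 ticks as 400, and the TCP_REXMTVAL() macro defined a few lines below reduces to srtt + 4 * rttvar, i.e. 200 ticks here.

/*
 * Editor's worked example, not part of the header.  The values are
 * illustrative; the function only demonstrates the fixed-point math.
 */
static int
example_rexmtval(struct tcpcb *tp)
{

	tp->t_srtt = 100 << TCP_RTT_SHIFT;	/* 100 ticks stored as 3200 */
	tp->t_rttvar = 25 << TCP_RTTVAR_SHIFT;	/* 25 ticks stored as 400 */
	tp->t_rttmin = TCPTV_MIN;		/* hz/33, below 200 at default hz */
	/* ((3200 >> 3) + 400) >> 2 = 200 ticks = srtt + 4 * rttvar. */
	return (TCP_REXMTVAL(tp));
}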
But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ uint64_t tcps_accepts; /* connections accepted */ uint64_t tcps_connects; /* connections established */ uint64_t tcps_drops; /* connections dropped */ uint64_t tcps_conndrops; /* embryonic connections dropped */ uint64_t tcps_minmssdrops; /* average minmss too low drops */ uint64_t tcps_closed; /* conn. closed (includes drops) */ uint64_t tcps_segstimed; /* segs where we tried to get rtt */ uint64_t tcps_rttupdated; /* times we succeeded */ uint64_t tcps_delack; /* delayed acks sent */ uint64_t tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ uint64_t tcps_rexmttimeo; /* retransmit timeouts */ uint64_t tcps_persisttimeo; /* persist timeouts */ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ uint64_t tcps_sndbyte; /* data bytes sent */ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ uint64_t tcps_sndacks; /* ack-only packets sent */ uint64_t tcps_sndprobe; /* window probes sent */ uint64_t tcps_sndurg; /* packets sent with URG only */ uint64_t tcps_sndwinup; /* window update-only packets sent */ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ uint64_t tcps_rcvtotal; /* total packets received */ uint64_t tcps_rcvpack; /* packets received in sequence */ uint64_t tcps_rcvbyte; /* bytes received in sequence */ uint64_t tcps_rcvbadsum; /* packets received with ccksum errs */ uint64_t tcps_rcvbadoff; /* packets received with bad offset */ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ uint64_t tcps_rcvshort; /* packets received too short */ uint64_t tcps_rcvduppack; /* duplicate-only packets received */ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. 
packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g. premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_ce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_ect0; /* ECN Capable Transport */ uint64_t tcps_ecn_ect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats */ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Failed to make signature */ 
uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ /* Path MTU Discovery Black Hole Detection related stats */ uint64_t tcps_pmtud_blackhole_activated; /* Black Hole Count */ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ uint64_t _pad[12]; /* 6 UTO, 6 TBD */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define TCPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_inc(int statnum); #define KMOD_TCPSTAT_INC(name) \ kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(uint64_t)) /* * Running TCP connection count by state. */ VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); #define V_tcps_states VNET(tcps_states) #define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) #define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. */ #define HHOOK_TCP_EST_IN 0 #define HHOOK_TCP_EST_OUT 1 #define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT struct tcp_hhook_data { struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; uint32_t len; int tso; tcp_seq curack; }; +#ifdef TCP_HHOOK +void hhook_run_tcp_est_out(struct tcpcb *tp, + struct tcphdr *th, struct tcpopt *to, + uint32_t len, int tso); #endif +#endif /* * TCB structure exported to user-land via sysctl(3). * * Fields prefixed with "xt_" are unique to the export structure, and fields * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'. * * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage * * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { size_t xt_len; /* length of this structure */ struct xinpcb xt_inp; char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */ int64_t spare64[8]; int32_t t_state; /* (s,p) */ uint32_t t_flags; /* (s,p) */ int32_t t_sndzerowin; /* (s) */ int32_t t_sndrexmitpack; /* (s) */ int32_t t_rcvoopack; /* (s) */ int32_t t_rcvtime; /* (s) */ int32_t tt_rexmt; /* (s) */ int32_t tt_persist; /* (s) */ int32_t tt_keep; /* (s) */ int32_t tt_2msl; /* (s) */ int32_t tt_delack; /* (s) */ int32_t t_logstate; /* (3) */ int32_t spare32[32]; } __aligned(8); #ifdef _KERNEL void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *); #endif #endif /* * TCP function information (name-to-id mapping, aliases, and refcnt) * exported to user-land via sysctl(3). 
*/ struct tcp_function_info { uint32_t tfi_refcnt; uint8_t tfi_id; char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX]; }; /* * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif extern int tcp_log_in_vain; /* * Global TCP tunables shared between different stacks. * Please keep the list sorted. */ VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_autorcvbuf_inc); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); VNET_DECLARE(int, tcp_do_ecn); VNET_DECLARE(int, tcp_do_rfc1323); VNET_DECLARE(int, tcp_do_rfc3042); VNET_DECLARE(int, tcp_do_rfc3390); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_do_rfc6675_pipe); VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(struct inpcbhead, tcb); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_do_rfc6675_pipe VNET(tcp_do_rfc6675_pipe) #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define 
V_tcp_insecure_syn VNET(tcp_insecure_syn) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_mssdflt VNET(tcp_mssdflt) #define V_tcp_recvspace VNET(tcp_recvspace) #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) +#define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) +#define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port) + #ifdef TCP_HHOOK VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) #endif int tcp_addoptions(struct tcpopt *, u_char *); int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_init(void); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_xmit_timer(struct tcpcb *, int); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type); void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); #ifdef TCP_HHOOK void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); #endif int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); int register_tcp_functions(struct tcp_function_block *blk, int wait); int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names); int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait); int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); void tcp_switch_back_to_default(struct tcpcb *tp); struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); u_int tcp_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_output(struct tcpcb *); 
void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); #ifdef VIMAGE void tcp_tw_destroy(void); #endif void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); void tcp_slowtimo(void); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, void *, void *); void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); +int tcp_timer_suspend(struct tcpcb *, uint32_t); +void tcp_timers_unsuspend(struct tcpcb *, uint32_t); int tcp_timer_active(struct tcpcb *, uint32_t); void tcp_timer_stop(struct tcpcb *, uint32_t); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); +int inp_to_cpuid(struct inpcb *inp); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); uint32_t tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; tcp_seq tcp_new_isn(struct tcpcb *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); +struct mbuf * + tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, + int32_t seglimit, int32_t segsize, struct sockbuf *sb); + static inline void tcp_fields_to_host(struct tcphdr *th) { th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ Index: head/sys/sys/mbuf.h =================================================================== --- head/sys/sys/mbuf.h (revision 334803) +++ head/sys/sys/mbuf.h (revision 334804) @@ -1,1388 +1,1388 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ /* XXX: These includes suck. Sorry! */ #include #ifdef _KERNEL #include #include #ifdef WITNESS #include #endif #endif #ifdef _KERNEL #include #define MBUF_PROBE1(probe, arg0) \ SDT_PROBE1(sdt, , , probe, arg0) #define MBUF_PROBE2(probe, arg0, arg1) \ SDT_PROBE2(sdt, , , probe, arg0, arg1) #define MBUF_PROBE3(probe, arg0, arg1, arg2) \ SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2) #define MBUF_PROBE4(probe, arg0, arg1, arg2, arg3) \ SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3) #define MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4) SDT_PROBE_DECLARE(sdt, , , m__init); SDT_PROBE_DECLARE(sdt, , , m__gethdr); SDT_PROBE_DECLARE(sdt, , , m__get); SDT_PROBE_DECLARE(sdt, , , m__getcl); SDT_PROBE_DECLARE(sdt, , , m__clget); SDT_PROBE_DECLARE(sdt, , , m__cljget); SDT_PROBE_DECLARE(sdt, , , m__cljset); SDT_PROBE_DECLARE(sdt, , , m__free); SDT_PROBE_DECLARE(sdt, , , m__freem); #endif /* _KERNEL */ /* * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead. * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in * sys/param.h), which has no additional overhead and is used instead of the * internal data area; this is done when at least MINCLSIZE of data must be * stored. Additionally, it is possible to allocate a separate buffer * externally and attach it to the mbuf in a way similar to that of mbuf * clusters. * * NB: These calculation do not take actual compiler-induced alignment and * padding inside the complete struct mbuf into account. Appropriate * attention is required when changing members of struct mbuf. * * MLEN is data length in a normal mbuf. * MHLEN is data length in an mbuf with pktheader. * MINCLSIZE is a smallest amount of data that should be put into cluster. * * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are sensible. */ struct mbuf; #define MHSIZE offsetof(struct mbuf, m_dat) #define MPKTHSIZE offsetof(struct mbuf, m_pktdat) #define MLEN ((int)(MSIZE - MHSIZE)) #define MHLEN ((int)(MSIZE - MPKTHSIZE)) #define MINCLSIZE (MHLEN + 1) #ifdef _KERNEL /*- * Macro for type conversion: convert mbuf pointer to data pointer of correct * type: * * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * mtodo(m, o) -- Same as above but with offset 'o' into data. */ #define mtod(m, t) ((t)((m)->m_data)) #define mtodo(m, o) ((void *)(((m)->m_data) + (o))) /* * Argument structure passed to UMA routines during mbuf and packet * allocations. 
*/ struct mb_args { int flags; /* Flags for mbuf being allocated */ short type; /* Type of mbuf being allocated */ }; #endif /* _KERNEL */ /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ void (*m_tag_free)(struct m_tag *); }; /* * Static network interface owned tag. * Allocated through ifp->if_snd_tag_alloc(). */ struct m_snd_tag { struct ifnet *ifp; /* network interface tag belongs to */ }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. * Size ILP32: 48 * LP64: 56 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct pkthdr { union { struct m_snd_tag *snd_tag; /* send tag, if any */ struct ifnet *rcvif; /* rcv interface */ }; SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ int32_t len; /* total packet length */ /* Layer crossing persistent information. */ uint32_t flowid; /* packet's 4-tuple system */ uint32_t csum_flags; /* checksum and offload features */ uint16_t fibnum; /* this packet should use this fib */ uint8_t cosqos; /* class/quality of service */ uint8_t rsstype; /* hash type */ union { uint64_t rcv_tstmp; /* timestamp in ns */ struct { uint8_t l2hlen; /* layer 2 hdr len */ uint8_t l3hlen; /* layer 3 hdr len */ uint8_t l4hlen; /* layer 4 hdr len */ uint8_t l5hlen; /* layer 5 hdr len */ uint32_t spare; }; }; union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_per; /* Layer specific non-persistent local storage for reassembly, etc. */ union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_loc; }; #define ether_vtag PH_per.sixteen[0] #define PH_vt PH_per #define vt_nrecs sixteen[0] #define tso_segsz PH_per.sixteen[1] #define lro_nsegs tso_segsz #define csum_phsum PH_per.sixteen[2] #define csum_data PH_per.thirtytwo[1] #define pace_thoff PH_loc.sixteen[0] #define pace_tlen PH_loc.sixteen[1] #define pace_drphdrlen PH_loc.sixteen[2] #define pace_tos PH_loc.eight[6] #define pace_lock PH_loc.eight[7] /* * Description of external storage mapped into mbuf; valid only if M_EXT is * set. * Size ILP32: 28 * LP64: 48 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ typedef void m_ext_free_t(struct mbuf *); struct m_ext { union { /* * If EXT_FLAG_EMBREF is set, then we use refcount in the * mbuf, the 'ext_count' member. Otherwise, we have a * shadow copy and we use pointer 'ext_cnt'. The original * mbuf is responsible to carry the pointer to free routine * and its arguments. They aren't copied into shadows in * mb_dupcl() to avoid dereferencing next cachelines. */ volatile u_int ext_count; volatile u_int *ext_cnt; }; char *ext_buf; /* start of buffer */ uint32_t ext_size; /* size of buffer, for ext_free */ uint32_t ext_type:8, /* type of external storage */ ext_flags:24; /* external storage mbuf flags */ /* * Fields below store the free context for the external storage. * They are valid only in the refcount carrying mbuf, the one with * EXT_FLAG_EMBREF flag, with exclusion for EXT_EXTREF type, where * the free context is copied into all mbufs that use same external * storage. 
*/ #define m_ext_copylen offsetof(struct m_ext, ext_free) m_ext_free_t *ext_free; /* free routine if not the usual */ void *ext_arg1; /* optional argument pointer */ void *ext_arg2; /* optional argument pointer */ }; /* * The core of the mbuf object along with some shortcut defines for practical * purposes. */ struct mbuf { /* * Header present at the beginning of every mbuf. * Size ILP32: 24 * LP64: 32 * Compile-time assertions in uipc_mbuf.c test these values to ensure * that they are correct. */ union { /* next buffer in chain */ struct mbuf *m_next; SLIST_ENTRY(mbuf) m_slist; STAILQ_ENTRY(mbuf) m_stailq; }; union { /* next chain in queue/record */ struct mbuf *m_nextpkt; SLIST_ENTRY(mbuf) m_slistpkt; STAILQ_ENTRY(mbuf) m_stailqpkt; }; caddr_t m_data; /* location of data */ int32_t m_len; /* amount of data in this mbuf */ uint32_t m_type:8, /* type of data in this mbuf */ m_flags:24; /* flags; see below */ #if !defined(__LP64__) uint32_t m_pad; /* pad for 64bit alignment */ #endif /* * A set of optional headers (packet header, external storage header) * and internal data storage. Historically, these arrays were sized * to MHLEN (space left after a packet header) and MLEN (space left * after only a regular mbuf header); they are now variable size in * order to support future work on variable-size mbufs. */ union { struct { struct pkthdr m_pkthdr; /* M_PKTHDR set */ union { struct m_ext m_ext; /* M_EXT set */ char m_pktdat[0]; }; }; char m_dat[0]; /* !M_PKTHDR, !M_EXT */ }; }; /* * mbuf flags of global significance and layer crossing. * Those of only protocol/layer specific significance are to be mapped * to M_PROTO[1-12] and cleared at layer handoff boundaries. * NB: Limited to the lower 24 bits. */ #define M_EXT 0x00000001 /* has associated external storage */ #define M_PKTHDR 0x00000002 /* start of record */ #define M_EOR 0x00000004 /* end of record */ #define M_RDONLY 0x00000008 /* associated data is marked read-only */ #define M_BCAST 0x00000010 /* send/received as link-level broadcast */ #define M_MCAST 0x00000020 /* send/received as link-level multicast */ #define M_PROMISC 0x00000040 /* packet was not for us */ #define M_VLANTAG 0x00000080 /* ether_vtag is valid */ -#define M_UNUSED_8 0x00000100 /* --available-- */ +#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */ #define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */ #define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */ #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically hw-stamped on port (useful for IEEE 1588 and 802.1AS) */ #define M_PROTO1 0x00001000 /* protocol-specific */ #define M_PROTO2 0x00002000 /* protocol-specific */ #define M_PROTO3 0x00004000 /* protocol-specific */ #define M_PROTO4 0x00008000 /* protocol-specific */ #define M_PROTO5 0x00010000 /* protocol-specific */ #define M_PROTO6 0x00020000 /* protocol-specific */ #define M_PROTO7 0x00040000 /* protocol-specific */ #define M_PROTO8 0x00080000 /* protocol-specific */ #define M_PROTO9 0x00100000 /* protocol-specific */ #define M_PROTO10 0x00200000 /* protocol-specific */ #define M_PROTO11 0x00400000 /* protocol-specific */ #define M_PROTO12 0x00800000 /* protocol-specific */ #define MB_DTOR_SKIP 0x1 /* don't pollute the cache by touching a freed mbuf */ /* * Flags to purge when crossing layers. */ #define M_PROTOFLAGS \ (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\ M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12) /* * Flags preserved when copying m_pkthdr. 
*/ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \ M_TSTMP_HPREC|M_PROTOFLAGS) /* * Mbuf flag description for use with printf(9) %b identifier. */ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ "\7M_PROMISC\10M_VLANTAG\13M_TSTMP\14M_TSTMP_HPREC" #define M_FLAG_PROTOBITS \ "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \ "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \ "\27M_PROTO11\30M_PROTO12" #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS) /* * Network interface cards are able to hash protocol fields (such as IPv4 * addresses and TCP port numbers) classify packets into flows. These flows * can then be used to maintain ordering while delivering packets to the OS * via parallel input queues, as well as to provide a stateless affinity * model. NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set * m_flag fields to indicate how the hash should be interpreted by the * network stack. * * Most NICs support RSS, which provides ordering and explicit affinity, and * use the hash m_flag bits to indicate what header fields were covered by * the hash. M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non- * RSS cards or configurations that provide an opaque flow identifier, allowing * for ordering and distribution without explicit affinity. Additionally, * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash * properties. * * The meaning of the IPV6_EX suffix: * "o Home address from the home address option in the IPv6 destination * options header. If the extension header is not present, use the Source * IPv6 Address. * o IPv6 address that is contained in the Routing-Header-Type-2 from the * associated extension header. If the extension header is not present, * use the Destination IPv6 Address." * Quoted from: * https://docs.microsoft.com/en-us/windows-hardware/drivers/network/rss-hashing-types#ndishashipv6ex */ #define M_HASHTYPE_HASHPROP 0x80 /* has hash properties */ #define M_HASHTYPE_HASH(t) (M_HASHTYPE_HASHPROP | (t)) /* Microsoft RSS standard hash types */ #define M_HASHTYPE_NONE 0 #define M_HASHTYPE_RSS_IPV4 M_HASHTYPE_HASH(1) /* IPv4 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV4 M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */ #define M_HASHTYPE_RSS_IPV6 M_HASHTYPE_HASH(3) /* IPv6 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV6 M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */ #define M_HASHTYPE_RSS_IPV6_EX M_HASHTYPE_HASH(5) /* IPv6 2-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_TCP_IPV6_EX M_HASHTYPE_HASH(6) /* TCPv6 4-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_UDP_IPV4 M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6 M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6_EX M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple + * ext hdrs */ #define M_HASHTYPE_OPAQUE 63 /* ordering, not affinity */ #define M_HASHTYPE_OPAQUE_HASH M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE) /* ordering+hash, not affinity*/ #define M_HASHTYPE_CLEAR(m) ((m)->m_pkthdr.rsstype = 0) #define M_HASHTYPE_GET(m) ((m)->m_pkthdr.rsstype) #define M_HASHTYPE_SET(m, v) ((m)->m_pkthdr.rsstype = (v)) #define M_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) #define M_HASHTYPE_ISHASH(m) (M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP) /* * COS/QOS class and quality of service tags. * It uses DSCP code points as base. 
*/ #define QOS_DSCP_CS0 0x00 #define QOS_DSCP_DEF QOS_DSCP_CS0 #define QOS_DSCP_CS1 0x20 #define QOS_DSCP_AF11 0x28 #define QOS_DSCP_AF12 0x30 #define QOS_DSCP_AF13 0x38 #define QOS_DSCP_CS2 0x40 #define QOS_DSCP_AF21 0x48 #define QOS_DSCP_AF22 0x50 #define QOS_DSCP_AF23 0x58 #define QOS_DSCP_CS3 0x60 #define QOS_DSCP_AF31 0x68 #define QOS_DSCP_AF32 0x70 #define QOS_DSCP_AF33 0x78 #define QOS_DSCP_CS4 0x80 #define QOS_DSCP_AF41 0x88 #define QOS_DSCP_AF42 0x90 #define QOS_DSCP_AF43 0x98 #define QOS_DSCP_CS5 0xa0 #define QOS_DSCP_EF 0xb8 #define QOS_DSCP_CS6 0xc0 #define QOS_DSCP_CS7 0xe0 /* * External mbuf storage buffer types. */ #define EXT_CLUSTER 1 /* mbuf cluster */ #define EXT_SFBUF 2 /* sendfile(2)'s sf_buf */ #define EXT_JUMBOP 3 /* jumbo cluster page sized */ #define EXT_JUMBO9 4 /* jumbo cluster 9216 bytes */ #define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_MBUF 7 /* external mbuf reference */ #define EXT_VENDOR1 224 /* for vendor-internal use */ #define EXT_VENDOR2 225 /* for vendor-internal use */ #define EXT_VENDOR3 226 /* for vendor-internal use */ #define EXT_VENDOR4 227 /* for vendor-internal use */ #define EXT_EXP1 244 /* for experimental use */ #define EXT_EXP2 245 /* for experimental use */ #define EXT_EXP3 246 /* for experimental use */ #define EXT_EXP4 247 /* for experimental use */ #define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 253 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */ #define EXT_EXTREF 255 /* has externally maintained ext_cnt ptr */ /* * Flags for external mbuf buffer types. * NB: limited to the lower 24 bits. */ #define EXT_FLAG_EMBREF 0x000001 /* embedded ext_count */ #define EXT_FLAG_EXTREF 0x000002 /* external ext_cnt, notyet */ #define EXT_FLAG_NOFREE 0x000010 /* don't free mbuf to pool, notyet */ #define EXT_FLAG_VENDOR1 0x010000 /* These flags are vendor */ #define EXT_FLAG_VENDOR2 0x020000 /* or submodule specific, */ #define EXT_FLAG_VENDOR3 0x040000 /* not used by mbuf code. */ #define EXT_FLAG_VENDOR4 0x080000 /* Set/read by submodule. */ #define EXT_FLAG_EXP1 0x100000 /* for experimental use */ #define EXT_FLAG_EXP2 0x200000 /* for experimental use */ #define EXT_FLAG_EXP3 0x400000 /* for experimental use */ #define EXT_FLAG_EXP4 0x800000 /* for experimental use */ /* * EXT flag description for use with printf(9) %b identifier. */ #define EXT_FLAG_BITS \ "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \ "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \ "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \ "\30EXT_FLAG_EXP4" /* * Flags indicating checksum, segmentation and other offload work to be * done, or already done, by hardware or lower layers. It is split into * separate inbound and outbound flags. * * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested * against ifnet if_hwassist. 
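 *
 * A hedged sketch, not a normative recipe ('ifp' and 'm' are
 * hypothetical): an output path may request offload only when the
 * interface advertises it,
 *
 *	if (ifp->if_hwassist & CSUM_IP_TCP)
 *		m->m_pkthdr.csum_flags |= CSUM_IP_TCP;
 *
 * while an input path may skip software verification whenever
 *
 *	(m->m_pkthdr.csum_flags & (CSUM_L4_CALC | CSUM_L4_VALID)) ==
 *	    (CSUM_L4_CALC | CSUM_L4_VALID)
 *
 * holds for a received packet.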
*/ #define CSUM_IP 0x00000001 /* IP header checksum offload */ #define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */ #define CSUM_IP_TCP 0x00000004 /* TCP checksum offload */ #define CSUM_IP_SCTP 0x00000008 /* SCTP checksum offload */ #define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */ #define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */ #define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */ #define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */ #define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */ #define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */ #define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */ /* Inbound checksum support where the checksum was verified by hardware. */ #define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */ #define CSUM_L3_VALID 0x02000000 /* checksum is correct */ #define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */ #define CSUM_L4_VALID 0x08000000 /* checksum is correct */ #define CSUM_L5_CALC 0x10000000 /* calculated layer 5 csum */ #define CSUM_L5_VALID 0x20000000 /* checksum is correct */ #define CSUM_COALESCED 0x40000000 /* contains merged segments */ /* * CSUM flag description for use with printf(9) %b identifier. */ #define CSUM_BITS \ "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \ "\6CSUM_IP_ISCSI" \ "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \ "\16CSUM_IP6_ISCSI" \ "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \ "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED" /* CSUM flags compatibility mappings. */ #define CSUM_IP_CHECKED CSUM_L3_CALC #define CSUM_IP_VALID CSUM_L3_VALID #define CSUM_DATA_VALID CSUM_L4_VALID #define CSUM_PSEUDO_HDR CSUM_L4_CALC #define CSUM_SCTP_VALID CSUM_L4_VALID #define CSUM_DELAY_DATA (CSUM_TCP|CSUM_UDP) #define CSUM_DELAY_IP CSUM_IP /* Only v4, no v6 IP hdr csum */ #define CSUM_DELAY_DATA_IPV6 (CSUM_TCP_IPV6|CSUM_UDP_IPV6) #define CSUM_DATA_VALID_IPV6 CSUM_DATA_VALID #define CSUM_TCP CSUM_IP_TCP #define CSUM_UDP CSUM_IP_UDP #define CSUM_SCTP CSUM_IP_SCTP #define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO) #define CSUM_UDP_IPV6 CSUM_IP6_UDP #define CSUM_TCP_IPV6 CSUM_IP6_TCP #define CSUM_SCTP_IPV6 CSUM_IP6_SCTP /* * mbuf types describing the content of the mbuf (including external storage). */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER MT_DATA /* packet header, use M_PKTHDR instead */ #define MT_VENDOR1 4 /* for vendor-internal use */ #define MT_VENDOR2 5 /* for vendor-internal use */ #define MT_VENDOR3 6 /* for vendor-internal use */ #define MT_VENDOR4 7 /* for vendor-internal use */ #define MT_SONAME 8 /* socket name */ #define MT_EXP1 9 /* for experimental use */ #define MT_EXP2 10 /* for experimental use */ #define MT_EXP3 11 /* for experimental use */ #define MT_EXP4 12 /* for experimental use */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_OOBDATA 15 /* expedited data */ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ #define MT_NOINIT 255 /* Not a type but a flag to allocate a non-initialized mbuf */ /* * String names of mbuf-related UMA(9) and malloc(9) types. Exposed to * !_KERNEL so that monitoring tools can look up the zones with * libmemstat(3). 
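 *
 * A hedged userland sketch along the lines of what netstat(1) does; the
 * exact libmemstat(3) calls below are from memory and should be checked
 * against the manual page:
 *
 *	struct memory_type_list *mtlp = memstat_mtl_alloc();
 *	struct memory_type *mtp;
 *
 *	if (memstat_sysctl_uma(mtlp, 0) == 0 &&
 *	    (mtp = memstat_mtl_find(mtlp, ALLOCATOR_UMA,
 *	    MBUF_MEM_NAME)) != NULL)
 *		printf("%ju mbufs in use\n",
 *		    (uintmax_t)memstat_get_count(mtp));
 *	memstat_mtl_free(mtlp);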
*/ #define MBUF_MEM_NAME "mbuf" #define MBUF_CLUSTER_MEM_NAME "mbuf_cluster" #define MBUF_PACKET_MEM_NAME "mbuf_packet" #define MBUF_JUMBOP_MEM_NAME "mbuf_jumbo_page" #define MBUF_JUMBO9_MEM_NAME "mbuf_jumbo_9k" #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" #ifdef _KERNEL #ifdef WITNESS #define MBUF_CHECKSLEEP(how) do { \ if (how == M_WAITOK) \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "Sleeping in \"%s\"", __func__); \ } while (0) #else #define MBUF_CHECKSLEEP(how) #endif /* * Network buffer allocation API * * The rest of it is defined in kern/kern_mbuf.c */ extern uma_zone_t zone_mbuf; extern uma_zone_t zone_clust; extern uma_zone_t zone_pack; extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; void mb_dupcl(struct mbuf *, struct mbuf *); void mb_free_ext(struct mbuf *); void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); int m_append(struct mbuf *, int, c_caddr_t); void m_cat(struct mbuf *, struct mbuf *); void m_catpkt(struct mbuf *, struct mbuf *); int m_clget(struct mbuf *m, int how); void *m_cljget(struct mbuf *m, int how, int size); struct mbuf *m_collapse(struct mbuf *, int, int); void m_copyback(struct mbuf *, int, int, c_caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_copyup(struct mbuf *, int, int); struct mbuf *m_defrag(struct mbuf *, int); void m_demote_pkthdr(struct mbuf *); void m_demote(struct mbuf *, int, int); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); struct mbuf *m_dup(const struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, const struct mbuf *, int); void m_extadd(struct mbuf *, char *, u_int, m_ext_free_t, void *, void *, int, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); void m_freem(struct mbuf *); struct mbuf *m_get2(int, int, short, int); struct mbuf *m_getjcl(int, short, int, int); struct mbuf *m_getm2(struct mbuf *, int, int, short, int); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); int m_mbuftouio(struct uio *, const struct mbuf *, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); int m_pkthdr_init(struct mbuf *, int); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *, int); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int, int); struct mbuf *m_unshare(struct mbuf *, int); static __inline int m_gettype(int size) { int type; switch (size) { case MSIZE: type = EXT_MBUF; break; case MCLBYTES: type = EXT_CLUSTER; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: type = EXT_JUMBOP; break; #endif case MJUM9BYTES: type = EXT_JUMBO9; break; case MJUM16BYTES: type = EXT_JUMBO16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (type); } /* * Associated an external reference counted buffer with an mbuf. 
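 *
 * A hedged usage sketch; the buffer 'buf', its 'size', the caller-owned
 * counter 'refcnt', free routine 'my_free' and argument 'sc' are all
 * hypothetical:
 *
 *	m = m_gethdr(M_NOWAIT, MT_DATA);
 *	if (m != NULL)
 *		m_extaddref(m, buf, size, &refcnt, my_free, sc, NULL);
 *
 * The reference count remains caller-managed (EXT_EXTREF); the routine
 * below only increments it.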
*/ static __inline void m_extaddref(struct mbuf *m, char *buf, u_int size, u_int *ref_cnt, m_ext_free_t freef, void *arg1, void *arg2) { KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__)); atomic_add_int(ref_cnt, 1); m->m_flags |= M_EXT; m->m_ext.ext_buf = buf; m->m_ext.ext_cnt = ref_cnt; m->m_data = m->m_ext.ext_buf; m->m_ext.ext_size = size; m->m_ext.ext_free = freef; m->m_ext.ext_arg1 = arg1; m->m_ext.ext_arg2 = arg2; m->m_ext.ext_type = EXT_EXTREF; m->m_ext.ext_flags = 0; } static __inline uma_zone_t m_getzone(int size) { uma_zone_t zone; switch (size) { case MCLBYTES: zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: zone = zone_jumbop; break; #endif case MJUM9BYTES: zone = zone_jumbo9; break; case MJUM16BYTES: zone = zone_jumbo16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (zone); } /* * Initialize an mbuf with linear storage. * * Inline because the consumer text overhead will be roughly the same to * initialize or call a function with this many parameters and M_PKTHDR * should go away with constant propagation for !MGETHDR. */ static __inline int m_init(struct mbuf *m, int how, short type, int flags) { int error; m->m_next = NULL; m->m_nextpkt = NULL; m->m_data = m->m_dat; m->m_len = 0; m->m_flags = flags; m->m_type = type; if (flags & M_PKTHDR) error = m_pkthdr_init(m, how); else error = 0; MBUF_PROBE5(m__init, m, how, type, flags, error); return (error); } static __inline struct mbuf * m_get(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = 0; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__get, how, type, m); return (m); } static __inline struct mbuf * m_gethdr(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = M_PKTHDR; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__gethdr, how, type, m); return (m); } static __inline struct mbuf * m_getcl(int how, short type, int flags) { struct mbuf *m; struct mb_args args; args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_pack, &args, how); MBUF_PROBE4(m__getcl, how, type, flags, m); return (m); } /* * XXX: m_cljset() is a dangerous API. One must attach only a new, * unreferenced cluster to an mbuf(9). It is not possible to assert * that, so care can be taken only by users of the API. */ static __inline void m_cljset(struct mbuf *m, void *cl, int type) { int size; switch (type) { case EXT_CLUSTER: size = MCLBYTES; break; #if MJUMPAGESIZE != MCLBYTES case EXT_JUMBOP: size = MJUMPAGESIZE; break; #endif case EXT_JUMBO9: size = MJUM9BYTES; break; case EXT_JUMBO16: size = MJUM16BYTES; break; default: panic("%s: unknown cluster type %d", __func__, type); break; } m->m_data = m->m_ext.ext_buf = cl; m->m_ext.ext_free = m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; m->m_flags |= M_EXT; MBUF_PROBE3(m__cljset, m, cl, type); } static __inline void m_chtype(struct mbuf *m, short new_type) { m->m_type = new_type; } static __inline void m_clrprotoflags(struct mbuf *m) { while (m) { m->m_flags &= ~M_PROTOFLAGS; m = m->m_next; } } static __inline struct mbuf * m_last(struct mbuf *m) { while (m->m_next) m = m->m_next; return (m); } static inline u_int m_extrefcnt(struct mbuf *m) { KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing", __func__)); return ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) ? 
m->m_ext.ext_count : *m->m_ext.ext_cnt); } /* * mbuf, cluster, and external object allocation macros (for compatibility * purposes). */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, arg1, arg2, flags, type) \ m_extadd((m), (char *)(buf), (size), (free), (arg1), (arg2), \ (flags), (type)) #define m_getm(m, len, how, type) \ m_getm2((m), (len), (how), (type), M_PKTHDR) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can * be both the local data payload, or an external buffer area, depending on * whether M_EXT is set). */ #define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \ (!(((m)->m_flags & M_EXT)) || \ (m_extrefcnt(m) == 1))) /* Check if the supplied mbuf has a packet header, or else panic. */ #define M_ASSERTPKTHDR(m) \ KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR, \ ("%s: no mbuf packet header!", __func__)) /* * Ensure that the supplied mbuf is a valid, non-free mbuf. * * XXX: Broken at the moment. Need some UMA magic to make it work again. */ #define M_ASSERTVALID(m) \ KASSERT((((struct mbuf *)m)->m_flags & 0) == 0, \ ("%s: attempted use of a free mbuf!", __func__)) /* * Return the address of the start of the buffer associated with an mbuf, * handling external storage, packet-header mbufs, and regular data mbufs. */ #define M_START(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \ &(m)->m_dat[0]) /* * Return the size of the buffer associated with an mbuf, handling external * storage, packet-header mbufs, and regular data mbufs. */ #define M_SIZE(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \ ((m)->m_flags & M_PKTHDR) ? MHLEN : \ MLEN) /* * Set the m_data pointer of a newly allocated mbuf to place an object of the * specified size at the end of the mbuf, longword aligned. * * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as * separate macros, each asserting that it was called at the proper moment. * This required callers to themselves test the storage type and call the * right one. Rather than require callers to be aware of those layout * decisions, we centralize here. */ static __inline void m_align(struct mbuf *m, int len) { #ifdef INVARIANTS const char *msg = "%s: not a virgin mbuf"; #endif int adjust; KASSERT(m->m_data == M_START(m), (msg, __func__)); adjust = M_SIZE(m) - len; m->m_data += adjust &~ (sizeof(long)-1); } #define M_ALIGN(m, len) m_align(m, len) #define MH_ALIGN(m, len) m_align(m, len) #define MEXT_ALIGN(m, len) m_align(m, len) /* * Compute the amount of space available before the current start of data in * an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_LEADINGSPACE(m) \ (M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0) /* * Compute the amount of space available after the end of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. 
We now allow mbuf-embedded data to be * read-only as well. */ #define M_TRAILINGSPACE(m) \ (M_WRITABLE(m) ? \ ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0) /* * Arrange to prepend space of size plen to mbuf m. If a new mbuf must be * allocated, how specifies whether to wait. If the allocation fails, the * original mbuf chain is freed and m is set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ MBUF_CHECKSLEEP(how); \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. This is a relatively expensive operation and * should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 extern int max_datalen; /* MHLEN - max_hdr */ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern int nmbclusters; /* Maximum number of clusters */ /*- * Network packets may have annotations attached by affixing a list of * "packet tags" to the pkthdr structure. Packet tags are dynamically * allocated semi-opaque data structures that have a fixed header * (struct m_tag) that specifies the size of the memory block and a * pair that identifies it. The cookie is a 32-bit unique * unsigned value used to identify a module or ABI. By convention this value * is chosen as the date+time that the module is created, expressed as the * number of seconds since the epoch (e.g., using date -u +'%s'). The type * value is an ABI/module-specific value that identifies a particular * annotation and is private to the module. For compatibility with systems * like OpenBSD that define packet tags w/o an ABI/module cookie, the value * PACKET_ABI_COMPAT is used to implement m_tag_get and m_tag_find * compatibility shim functions and several tag types are defined below. * Users that do not require compatibility should use a private cookie value * so that packet tag-related definitions can be maintained privately. * * Note that the packet tag returned by m_tag_alloc has the default memory * alignment implemented by malloc. To reference private data one can use a * construct like: * * struct m_tag *mtag = m_tag_alloc(...); * struct foo *p = (struct foo *)(mtag+1); * * if the alignment of struct m_tag is sufficient for referencing members of * struct foo. Otherwise it is necessary to embed struct m_tag within the * private data structure to insure proper alignment; e.g., * * struct foo { * struct m_tag tag; * ... * }; * struct foo *p = (struct foo *) m_tag_alloc(...); * struct m_tag *mtag = &p->tag; */ /* * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise * tags are expected to ``vanish'' when they pass through a network * interface. For most interfaces this happens normally as the tags are * reclaimed when the mbuf is free'd. However in some special cases * reclaiming must be done manually. An example is packets that pass through * the loopback interface. Also, one must be careful to do this when * ``turning around'' packets (e.g., icmp_reflect). * * To mark a tag persistent bit-or this flag in when defining the tag id. * The tag will then be treated as described above. 
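 *
 * Complementing the alignment note above, a hedged allocation and
 * lookup sketch; MTAG_EXAMPLE_COOKIE and MTAG_EXAMPLE_TYPE are
 * hypothetical per-module values chosen as described earlier:
 *
 *	struct m_tag *mtag;
 *
 *	mtag = m_tag_alloc(MTAG_EXAMPLE_COOKIE,
 *	    MTAG_EXAMPLE_TYPE | MTAG_PERSISTENT, payload_len, M_NOWAIT);
 *	if (mtag != NULL)
 *		m_tag_prepend(m, mtag);
 *	...
 *	mtag = m_tag_locate(m, MTAG_EXAMPLE_COOKIE,
 *	    MTAG_EXAMPLE_TYPE | MTAG_PERSISTENT, NULL);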
*/ #define MTAG_PERSISTENT 0x800 #define PACKET_TAG_NONE 0 /* Nadda */ /* Packet tags for use with PACKET_ABI_COMPAT. */ #define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ #define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ #define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ #define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ #define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ #define PACKET_TAG_GIF 8 /* GIF processing done */ #define PACKET_TAG_GRE 9 /* GRE processing done */ #define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ #define PACKET_TAG_ENCAP 11 /* Encap. processing */ #define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ #define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ #define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ #define PACKET_TAG_DUMMYNET 15 /* dummynet info */ #define PACKET_TAG_DIVERT 17 /* divert info */ #define PACKET_TAG_IPFORWARD 18 /* ipforward info */ #define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ #define PACKET_TAG_PF (21 | MTAG_PERSISTENT) /* PF/ALTQ information */ #define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ #define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ #define PACKET_TAG_CARP 28 /* CARP info */ #define PACKET_TAG_IPSEC_NAT_T_PORTS 29 /* two uint16_t */ #define PACKET_TAG_ND_OUTGOING 30 /* ND outgoing */ /* Specific cookies and tags. */ /* Packet tag routines. */ struct m_tag *m_tag_alloc(u_int32_t, int, int, int); void m_tag_delete(struct mbuf *, struct m_tag *); void m_tag_delete_chain(struct mbuf *, struct m_tag *); void m_tag_free_default(struct m_tag *); struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *); struct m_tag *m_tag_copy(struct m_tag *, int); int m_tag_copy_chain(struct mbuf *, const struct mbuf *, int); void m_tag_delete_nonpersistent(struct mbuf *); /* * Initialize the list of tags associated with an mbuf. */ static __inline void m_tag_init(struct mbuf *m) { SLIST_INIT(&m->m_pkthdr.tags); } /* * Set up the contents of a tag. Note that this does not fill in the free * method; the caller is expected to do that. * * XXX probably should be called m_tag_init, but that was already taken. */ static __inline void m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len) { t->m_tag_id = type; t->m_tag_len = len; t->m_tag_cookie = cookie; } /* * Reclaim resources associated with a tag. */ static __inline void m_tag_free(struct m_tag *t) { (*t->m_tag_free)(t); } /* * Return the first tag associated with an mbuf. */ static __inline struct m_tag * m_tag_first(struct mbuf *m) { return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* * Return the next tag in the list of tags associated with an mbuf. */ static __inline struct m_tag * m_tag_next(struct mbuf *m __unused, struct m_tag *t) { return (SLIST_NEXT(t, m_tag_link)); } /* * Prepend a tag to the list of tags associated with an mbuf. */ static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Unlink a tag from the list of tags associated with an mbuf. */ static __inline void m_tag_unlink(struct mbuf *m, struct m_tag *t) { SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } /* These are for OpenBSD compatibility. 
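 *
 * A hedged sketch of the shims declared below, using one of the
 * PACKET_ABI_COMPAT tag types ('m' is a hypothetical mbuf); finding the
 * tag on a later pass indicates the packet was already handled:
 *
 *	struct m_tag *mtag;
 *
 *	mtag = m_tag_get(PACKET_TAG_GRE, 0, M_NOWAIT);
 *	if (mtag != NULL)
 *		m_tag_prepend(m, mtag);
 *	...
 *	if (m_tag_find(m, PACKET_TAG_GRE, NULL) != NULL)
 *		return;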
*/ #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait)); } static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); } static __inline struct mbuf * m_free(struct mbuf *m) { struct mbuf *n = m->m_next; MBUF_PROBE1(m__free, m); if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE)) m_tag_delete_chain(m, NULL); if (m->m_flags & M_EXT) mb_free_ext(m); else if ((m->m_flags & M_NOFREE) == 0) uma_zfree(zone_mbuf, m); return (n); } static __inline int rt_m_getfib(struct mbuf *m) { KASSERT(m->m_flags & M_PKTHDR , ("Attempt to get FIB from non header mbuf.")); return (m->m_pkthdr.fibnum); } #define M_GETFIB(_m) rt_m_getfib(_m) #define M_SETFIB(_m, _fib) do { \ KASSERT((_m)->m_flags & M_PKTHDR, ("Attempt to set FIB on non header mbuf.")); \ ((_m)->m_pkthdr.fibnum) = (_fib); \ } while (0) /* flags passed as first argument for "m_ether_tcpip_hash()" */ #define MBUF_HASHFLAG_L2 (1 << 2) #define MBUF_HASHFLAG_L3 (1 << 3) #define MBUF_HASHFLAG_L4 (1 << 4) /* mbuf hashing helper routines */ uint32_t m_ether_tcpip_hash_init(void); uint32_t m_ether_tcpip_hash(const uint32_t, const struct mbuf *, const uint32_t); #ifdef MBUF_PROFILING void m_profile(struct mbuf *m); #define M_PROFILE(m) m_profile(m) #else #define M_PROFILE(m) #endif struct mbufq { STAILQ_HEAD(, mbuf) mq_head; int mq_len; int mq_maxlen; }; static inline void mbufq_init(struct mbufq *mq, int maxlen) { STAILQ_INIT(&mq->mq_head); mq->mq_maxlen = maxlen; mq->mq_len = 0; } static inline struct mbuf * mbufq_flush(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); STAILQ_INIT(&mq->mq_head); mq->mq_len = 0; return (m); } static inline void mbufq_drain(struct mbufq *mq) { struct mbuf *m, *n; n = mbufq_flush(mq); while ((m = n) != NULL) { n = STAILQ_NEXT(m, m_stailqpkt); m_freem(m); } } static inline struct mbuf * mbufq_first(const struct mbufq *mq) { return (STAILQ_FIRST(&mq->mq_head)); } static inline struct mbuf * mbufq_last(const struct mbufq *mq) { return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt)); } static inline int mbufq_full(const struct mbufq *mq) { return (mq->mq_len >= mq->mq_maxlen); } static inline int mbufq_len(const struct mbufq *mq) { return (mq->mq_len); } static inline int mbufq_enqueue(struct mbufq *mq, struct mbuf *m) { if (mbufq_full(mq)) return (ENOBUFS); STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; return (0); } static inline struct mbuf * mbufq_dequeue(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); if (m) { STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt); m->m_nextpkt = NULL; mq->mq_len--; } return (m); } static inline void mbufq_prepend(struct mbufq *mq, struct mbuf *m) { STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; } /* * Note: this doesn't enforce the maximum list size for dst. 
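 *
 * A hedged sketch of typical mbufq usage; the depth 128 is arbitrary,
 * and mbufq_enqueue() returns ENOBUFS once mq_maxlen is reached, at
 * which point the caller still owns the mbuf:
 *
 *	struct mbufq mq;
 *
 *	mbufq_init(&mq, 128);
 *	if (mbufq_enqueue(&mq, m) != 0)
 *		m_freem(m);
 *	...
 *	while ((m = mbufq_dequeue(&mq)) != NULL)
 *		process(m);
 *	mbufq_drain(&mq);
 *
 * ('process' being a hypothetical consumer.)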
*/ static inline void mbufq_concat(struct mbufq *mq_dst, struct mbufq *mq_src) { mq_dst->mq_len += mq_src->mq_len; STAILQ_CONCAT(&mq_dst->mq_head, &mq_src->mq_head); mq_src->mq_len = 0; } #ifdef _SYS_TIMESPEC_H_ static inline void mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts) { KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m)); KASSERT((m->m_flags & M_TSTMP) != 0, ("mbuf %p no M_TSTMP", m)); ts->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; ts->tv_nsec = m->m_pkthdr.rcv_tstmp % 1000000000; } #endif #ifdef NETDUMP /* Invoked from the netdump client code. */ void netdump_mbuf_drain(void); void netdump_mbuf_dump(void); void netdump_mbuf_reinit(int nmbuf, int nclust, int clsize); #endif #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */ Index: head/sys/sys/queue.h =================================================================== --- head/sys/sys/queue.h (revision 334803) +++ head/sys/sys/queue.h (revision 334804) @@ -1,860 +1,871 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)queue.h 8.5 (Berkeley) 8/20/94 * $FreeBSD$ */ #ifndef _SYS_QUEUE_H_ #define _SYS_QUEUE_H_ #include /* * This file defines four types of data structures: singly-linked lists, * singly-linked tail queues, lists and tail queues. * * A singly-linked list is headed by a single forward pointer. The elements * are singly linked for minimum space and pointer manipulation overhead at * the expense of O(n) removal for arbitrary elements. New elements can be * added to the list after an existing element or at the head of the list. * Elements being removed from the head of the list should use the explicit * macro for this purpose for optimum efficiency. A singly-linked list may * only be traversed in the forward direction. Singly-linked lists are ideal * for applications with large datasets and few or no removals or for * implementing a LIFO queue. * * A singly-linked tail queue is headed by a pair of pointers, one to the * head of the list and the other to the tail of the list. 
The elements are * singly linked for minimum space and pointer manipulation overhead at the * expense of O(n) removal for arbitrary elements. New elements can be added * to the list after an existing element, at the head of the list, or at the * end of the list. Elements being removed from the head of the tail queue * should use the explicit macro for this purpose for optimum efficiency. * A singly-linked tail queue may only be traversed in the forward direction. * Singly-linked tail queues are ideal for applications with large datasets * and few or no removals or for implementing a FIFO queue. * * A list is headed by a single forward pointer (or an array of forward * pointers for a hash table header). The elements are doubly linked * so that an arbitrary element can be removed without a need to * traverse the list. New elements can be added to the list before * or after an existing element or at the head of the list. A list * may be traversed in either direction. * * A tail queue is headed by a pair of pointers, one to the head of the * list and the other to the tail of the list. The elements are doubly * linked so that an arbitrary element can be removed without a need to * traverse the list. New elements can be added to the list before or * after an existing element, at the head of the list, or at the end of * the list. A tail queue may be traversed in either direction. * * For details on the use of these macros, see the queue(3) manual page. * * Below is a summary of implemented functions where: * + means the macro is available * - means the macro is not available * s means the macro is available but is slow (runs in O(n) time) * * SLIST LIST STAILQ TAILQ * _HEAD + + + + * _CLASS_HEAD + + + + * _HEAD_INITIALIZER + + + + * _ENTRY + + + + * _CLASS_ENTRY + + + + * _INIT + + + + * _EMPTY + + + + * _FIRST + + + + * _NEXT + + + + * _PREV - + - + * _LAST - - + + + * _LAST_FAST - - - + * _FOREACH + + + + * _FOREACH_FROM + + + + * _FOREACH_SAFE + + + + * _FOREACH_FROM_SAFE + + + + * _FOREACH_REVERSE - - - + * _FOREACH_REVERSE_FROM - - - + * _FOREACH_REVERSE_SAFE - - - + * _FOREACH_REVERSE_FROM_SAFE - - - + * _INSERT_HEAD + + + + * _INSERT_BEFORE - + - + * _INSERT_AFTER + + + + * _INSERT_TAIL - - + + * _CONCAT s s + + * _REMOVE_AFTER + - + - * _REMOVE_HEAD + - + - * _REMOVE s + s + * _SWAP + + + + * */ #ifdef QUEUE_MACRO_DEBUG #warn Use QUEUE_MACRO_DEBUG_TRACE and/or QUEUE_MACRO_DEBUG_TRASH #define QUEUE_MACRO_DEBUG_TRACE #define QUEUE_MACRO_DEBUG_TRASH #endif #ifdef QUEUE_MACRO_DEBUG_TRACE /* Store the last 2 places the queue element or head was altered */ struct qm_trace { unsigned long lastline; unsigned long prevline; const char *lastfile; const char *prevfile; }; #define TRACEBUF struct qm_trace trace; #define TRACEBUF_INITIALIZER { __LINE__, 0, __FILE__, NULL } , #define QMD_TRACE_HEAD(head) do { \ (head)->trace.prevline = (head)->trace.lastline; \ (head)->trace.prevfile = (head)->trace.lastfile; \ (head)->trace.lastline = __LINE__; \ (head)->trace.lastfile = __FILE__; \ } while (0) #define QMD_TRACE_ELEM(elem) do { \ (elem)->trace.prevline = (elem)->trace.lastline; \ (elem)->trace.prevfile = (elem)->trace.lastfile; \ (elem)->trace.lastline = __LINE__; \ (elem)->trace.lastfile = __FILE__; \ } while (0) #else /* !QUEUE_MACRO_DEBUG_TRACE */ #define QMD_TRACE_ELEM(elem) #define QMD_TRACE_HEAD(head) #define TRACEBUF #define TRACEBUF_INITIALIZER #endif /* QUEUE_MACRO_DEBUG_TRACE */ #ifdef QUEUE_MACRO_DEBUG_TRASH #define TRASHIT(x) do {(x) = (void *)-1;} while (0) #define QMD_IS_TRASHED(x) ((x) 
== (void *)(intptr_t)-1) #else /* !QUEUE_MACRO_DEBUG_TRASH */ #define TRASHIT(x) #define QMD_IS_TRASHED(x) 0 #endif /* QUEUE_MACRO_DEBUG_TRASH */ #if defined(QUEUE_MACRO_DEBUG_TRACE) || defined(QUEUE_MACRO_DEBUG_TRASH) #define QMD_SAVELINK(name, link) void **name = (void *)&(link) #else /* !QUEUE_MACRO_DEBUG_TRACE && !QUEUE_MACRO_DEBUG_TRASH */ #define QMD_SAVELINK(name, link) #endif /* QUEUE_MACRO_DEBUG_TRACE || QUEUE_MACRO_DEBUG_TRASH */ #ifdef __cplusplus /* * In C++ there can be structure lists and class lists: */ #define QUEUE_TYPEOF(type) type #else #define QUEUE_TYPEOF(type) struct type #endif /* * Singly-linked List declarations. */ #define SLIST_HEAD(name, type) \ struct name { \ struct type *slh_first; /* first element */ \ } #define SLIST_CLASS_HEAD(name, type) \ struct name { \ class type *slh_first; /* first element */ \ } #define SLIST_HEAD_INITIALIZER(head) \ { NULL } #define SLIST_ENTRY(type) \ struct { \ struct type *sle_next; /* next element */ \ } #define SLIST_CLASS_ENTRY(type) \ struct { \ class type *sle_next; /* next element */ \ } /* * Singly-linked List functions. */ #if (defined(_KERNEL) && defined(INVARIANTS)) #define QMD_SLIST_CHECK_PREVPTR(prevp, elm) do { \ if (*(prevp) != (elm)) \ panic("Bad prevptr *(%p) == %p != %p", \ (prevp), *(prevp), (elm)); \ } while (0) #else #define QMD_SLIST_CHECK_PREVPTR(prevp, elm) #endif #define SLIST_CONCAT(head1, head2, type, field) do { \ QUEUE_TYPEOF(type) *curelm = SLIST_FIRST(head1); \ if (curelm == NULL) { \ if ((SLIST_FIRST(head1) = SLIST_FIRST(head2)) != NULL) \ SLIST_INIT(head2); \ } else if (SLIST_FIRST(head2) != NULL) { \ while (SLIST_NEXT(curelm, field) != NULL) \ curelm = SLIST_NEXT(curelm, field); \ SLIST_NEXT(curelm, field) = SLIST_FIRST(head2); \ SLIST_INIT(head2); \ } \ } while (0) #define SLIST_EMPTY(head) ((head)->slh_first == NULL) #define SLIST_FIRST(head) ((head)->slh_first) #define SLIST_FOREACH(var, head, field) \ for ((var) = SLIST_FIRST((head)); \ (var); \ (var) = SLIST_NEXT((var), field)) #define SLIST_FOREACH_FROM(var, head, field) \ for ((var) = ((var) ? (var) : SLIST_FIRST((head))); \ (var); \ (var) = SLIST_NEXT((var), field)) #define SLIST_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = SLIST_FIRST((head)); \ (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ (var) = (tvar)) #define SLIST_FOREACH_FROM_SAFE(var, head, field, tvar) \ for ((var) = ((var) ? 
(var) : SLIST_FIRST((head))); \ (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ (var) = (tvar)) #define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ for ((varp) = &SLIST_FIRST((head)); \ ((var) = *(varp)) != NULL; \ (varp) = &SLIST_NEXT((var), field)) #define SLIST_INIT(head) do { \ SLIST_FIRST((head)) = NULL; \ } while (0) #define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ SLIST_NEXT((slistelm), field) = (elm); \ } while (0) #define SLIST_INSERT_HEAD(head, elm, field) do { \ SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ SLIST_FIRST((head)) = (elm); \ } while (0) #define SLIST_NEXT(elm, field) ((elm)->field.sle_next) #define SLIST_REMOVE(head, elm, type, field) do { \ QMD_SAVELINK(oldnext, (elm)->field.sle_next); \ if (SLIST_FIRST((head)) == (elm)) { \ SLIST_REMOVE_HEAD((head), field); \ } \ else { \ QUEUE_TYPEOF(type) *curelm = SLIST_FIRST(head); \ while (SLIST_NEXT(curelm, field) != (elm)) \ curelm = SLIST_NEXT(curelm, field); \ SLIST_REMOVE_AFTER(curelm, field); \ } \ TRASHIT(*oldnext); \ } while (0) #define SLIST_REMOVE_AFTER(elm, field) do { \ SLIST_NEXT(elm, field) = \ SLIST_NEXT(SLIST_NEXT(elm, field), field); \ } while (0) #define SLIST_REMOVE_HEAD(head, field) do { \ SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ } while (0) #define SLIST_REMOVE_PREVPTR(prevp, elm, field) do { \ QMD_SLIST_CHECK_PREVPTR(prevp, elm); \ *(prevp) = SLIST_NEXT(elm, field); \ TRASHIT((elm)->field.sle_next); \ } while (0) #define SLIST_SWAP(head1, head2, type) do { \ QUEUE_TYPEOF(type) *swap_first = SLIST_FIRST(head1); \ SLIST_FIRST(head1) = SLIST_FIRST(head2); \ SLIST_FIRST(head2) = swap_first; \ } while (0) /* * Singly-linked Tail queue declarations. */ #define STAILQ_HEAD(name, type) \ struct name { \ struct type *stqh_first;/* first element */ \ struct type **stqh_last;/* addr of last next element */ \ } #define STAILQ_CLASS_HEAD(name, type) \ struct name { \ class type *stqh_first; /* first element */ \ class type **stqh_last; /* addr of last next element */ \ } #define STAILQ_HEAD_INITIALIZER(head) \ { NULL, &(head).stqh_first } #define STAILQ_ENTRY(type) \ struct { \ struct type *stqe_next; /* next element */ \ } #define STAILQ_CLASS_ENTRY(type) \ struct { \ class type *stqe_next; /* next element */ \ } /* * Singly-linked Tail queue functions. */ #define STAILQ_CONCAT(head1, head2) do { \ if (!STAILQ_EMPTY((head2))) { \ *(head1)->stqh_last = (head2)->stqh_first; \ (head1)->stqh_last = (head2)->stqh_last; \ STAILQ_INIT((head2)); \ } \ } while (0) #define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) #define STAILQ_FIRST(head) ((head)->stqh_first) #define STAILQ_FOREACH(var, head, field) \ for((var) = STAILQ_FIRST((head)); \ (var); \ (var) = STAILQ_NEXT((var), field)) #define STAILQ_FOREACH_FROM(var, head, field) \ for ((var) = ((var) ? (var) : STAILQ_FIRST((head))); \ (var); \ (var) = STAILQ_NEXT((var), field)) #define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = STAILQ_FIRST((head)); \ (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ (var) = (tvar)) #define STAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ for ((var) = ((var) ? 
(var) : STAILQ_FIRST((head))); \ (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ (var) = (tvar)) #define STAILQ_INIT(head) do { \ STAILQ_FIRST((head)) = NULL; \ (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) #define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ (head)->stqh_last = &STAILQ_NEXT((elm), field); \ STAILQ_NEXT((tqelm), field) = (elm); \ } while (0) #define STAILQ_INSERT_HEAD(head, elm, field) do { \ if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ (head)->stqh_last = &STAILQ_NEXT((elm), field); \ STAILQ_FIRST((head)) = (elm); \ } while (0) #define STAILQ_INSERT_TAIL(head, elm, field) do { \ STAILQ_NEXT((elm), field) = NULL; \ *(head)->stqh_last = (elm); \ (head)->stqh_last = &STAILQ_NEXT((elm), field); \ } while (0) #define STAILQ_LAST(head, type, field) \ (STAILQ_EMPTY((head)) ? NULL : \ __containerof((head)->stqh_last, \ QUEUE_TYPEOF(type), field.stqe_next)) #define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) #define STAILQ_REMOVE(head, elm, type, field) do { \ QMD_SAVELINK(oldnext, (elm)->field.stqe_next); \ if (STAILQ_FIRST((head)) == (elm)) { \ STAILQ_REMOVE_HEAD((head), field); \ } \ else { \ QUEUE_TYPEOF(type) *curelm = STAILQ_FIRST(head); \ while (STAILQ_NEXT(curelm, field) != (elm)) \ curelm = STAILQ_NEXT(curelm, field); \ STAILQ_REMOVE_AFTER(head, curelm, field); \ } \ TRASHIT(*oldnext); \ } while (0) #define STAILQ_REMOVE_AFTER(head, elm, field) do { \ if ((STAILQ_NEXT(elm, field) = \ STAILQ_NEXT(STAILQ_NEXT(elm, field), field)) == NULL) \ (head)->stqh_last = &STAILQ_NEXT((elm), field); \ } while (0) #define STAILQ_REMOVE_HEAD(head, field) do { \ if ((STAILQ_FIRST((head)) = \ STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) #define STAILQ_SWAP(head1, head2, type) do { \ QUEUE_TYPEOF(type) *swap_first = STAILQ_FIRST(head1); \ QUEUE_TYPEOF(type) **swap_last = (head1)->stqh_last; \ STAILQ_FIRST(head1) = STAILQ_FIRST(head2); \ (head1)->stqh_last = (head2)->stqh_last; \ STAILQ_FIRST(head2) = swap_first; \ (head2)->stqh_last = swap_last; \ if (STAILQ_EMPTY(head1)) \ (head1)->stqh_last = &STAILQ_FIRST(head1); \ if (STAILQ_EMPTY(head2)) \ (head2)->stqh_last = &STAILQ_FIRST(head2); \ } while (0) /* * List declarations. */ #define LIST_HEAD(name, type) \ struct name { \ struct type *lh_first; /* first element */ \ } #define LIST_CLASS_HEAD(name, type) \ struct name { \ class type *lh_first; /* first element */ \ } #define LIST_HEAD_INITIALIZER(head) \ { NULL } #define LIST_ENTRY(type) \ struct { \ struct type *le_next; /* next element */ \ struct type **le_prev; /* address of previous next element */ \ } #define LIST_CLASS_ENTRY(type) \ struct { \ class type *le_next; /* next element */ \ class type **le_prev; /* address of previous next element */ \ } /* * List functions. */ #if (defined(_KERNEL) && defined(INVARIANTS)) /* * QMD_LIST_CHECK_HEAD(LIST_HEAD *head, LIST_ENTRY NAME) * * If the list is non-empty, validates that the first element of the list * points back at 'head.' */ #define QMD_LIST_CHECK_HEAD(head, field) do { \ if (LIST_FIRST((head)) != NULL && \ LIST_FIRST((head))->field.le_prev != \ &LIST_FIRST((head))) \ panic("Bad list head %p first->prev != head", (head)); \ } while (0) /* * QMD_LIST_CHECK_NEXT(TYPE *elm, LIST_ENTRY NAME) * * If an element follows 'elm' in the list, validates that the next element * points back at 'elm.' 
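 *
 * For example, after
 *
 *	LIST_INSERT_AFTER(a, b, field);
 *
 * the invariant checked here is b->field.le_prev == &a->field.le_next
 * ('a' and 'b' being hypothetical elements); a stale or overwritten
 * entry trips the panic below.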
*/ #define QMD_LIST_CHECK_NEXT(elm, field) do { \ if (LIST_NEXT((elm), field) != NULL && \ LIST_NEXT((elm), field)->field.le_prev != \ &((elm)->field.le_next)) \ panic("Bad link elm %p next->prev != elm", (elm)); \ } while (0) /* * QMD_LIST_CHECK_PREV(TYPE *elm, LIST_ENTRY NAME) * * Validates that the previous element (or head of the list) points to 'elm.' */ #define QMD_LIST_CHECK_PREV(elm, field) do { \ if (*(elm)->field.le_prev != (elm)) \ panic("Bad link elm %p prev->next != elm", (elm)); \ } while (0) #else #define QMD_LIST_CHECK_HEAD(head, field) #define QMD_LIST_CHECK_NEXT(elm, field) #define QMD_LIST_CHECK_PREV(elm, field) #endif /* (_KERNEL && INVARIANTS) */ #define LIST_CONCAT(head1, head2, type, field) do { \ QUEUE_TYPEOF(type) *curelm = LIST_FIRST(head1); \ if (curelm == NULL) { \ if ((LIST_FIRST(head1) = LIST_FIRST(head2)) != NULL) { \ LIST_FIRST(head2)->field.le_prev = \ &LIST_FIRST((head1)); \ LIST_INIT(head2); \ } \ } else if (LIST_FIRST(head2) != NULL) { \ while (LIST_NEXT(curelm, field) != NULL) \ curelm = LIST_NEXT(curelm, field); \ LIST_NEXT(curelm, field) = LIST_FIRST(head2); \ LIST_FIRST(head2)->field.le_prev = &LIST_NEXT(curelm, field); \ LIST_INIT(head2); \ } \ } while (0) #define LIST_EMPTY(head) ((head)->lh_first == NULL) #define LIST_FIRST(head) ((head)->lh_first) #define LIST_FOREACH(var, head, field) \ for ((var) = LIST_FIRST((head)); \ (var); \ (var) = LIST_NEXT((var), field)) #define LIST_FOREACH_FROM(var, head, field) \ for ((var) = ((var) ? (var) : LIST_FIRST((head))); \ (var); \ (var) = LIST_NEXT((var), field)) #define LIST_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = LIST_FIRST((head)); \ (var) && ((tvar) = LIST_NEXT((var), field), 1); \ (var) = (tvar)) #define LIST_FOREACH_FROM_SAFE(var, head, field, tvar) \ for ((var) = ((var) ? (var) : LIST_FIRST((head))); \ (var) && ((tvar) = LIST_NEXT((var), field), 1); \ (var) = (tvar)) #define LIST_INIT(head) do { \ LIST_FIRST((head)) = NULL; \ } while (0) #define LIST_INSERT_AFTER(listelm, elm, field) do { \ QMD_LIST_CHECK_NEXT(listelm, field); \ if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ LIST_NEXT((listelm), field)->field.le_prev = \ &LIST_NEXT((elm), field); \ LIST_NEXT((listelm), field) = (elm); \ (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ } while (0) #define LIST_INSERT_BEFORE(listelm, elm, field) do { \ QMD_LIST_CHECK_PREV(listelm, field); \ (elm)->field.le_prev = (listelm)->field.le_prev; \ LIST_NEXT((elm), field) = (listelm); \ *(listelm)->field.le_prev = (elm); \ (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ } while (0) #define LIST_INSERT_HEAD(head, elm, field) do { \ QMD_LIST_CHECK_HEAD((head), field); \ if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ LIST_FIRST((head)) = (elm); \ (elm)->field.le_prev = &LIST_FIRST((head)); \ } while (0) #define LIST_NEXT(elm, field) ((elm)->field.le_next) #define LIST_PREV(elm, head, type, field) \ ((elm)->field.le_prev == &LIST_FIRST((head)) ? 
NULL : \ __containerof((elm)->field.le_prev, \ QUEUE_TYPEOF(type), field.le_next)) #define LIST_REMOVE(elm, field) do { \ QMD_SAVELINK(oldnext, (elm)->field.le_next); \ QMD_SAVELINK(oldprev, (elm)->field.le_prev); \ QMD_LIST_CHECK_NEXT(elm, field); \ QMD_LIST_CHECK_PREV(elm, field); \ if (LIST_NEXT((elm), field) != NULL) \ LIST_NEXT((elm), field)->field.le_prev = \ (elm)->field.le_prev; \ *(elm)->field.le_prev = LIST_NEXT((elm), field); \ TRASHIT(*oldnext); \ TRASHIT(*oldprev); \ } while (0) #define LIST_SWAP(head1, head2, type, field) do { \ QUEUE_TYPEOF(type) *swap_tmp = LIST_FIRST(head1); \ LIST_FIRST((head1)) = LIST_FIRST((head2)); \ LIST_FIRST((head2)) = swap_tmp; \ if ((swap_tmp = LIST_FIRST((head1))) != NULL) \ swap_tmp->field.le_prev = &LIST_FIRST((head1)); \ if ((swap_tmp = LIST_FIRST((head2))) != NULL) \ swap_tmp->field.le_prev = &LIST_FIRST((head2)); \ } while (0) /* * Tail queue declarations. */ #define TAILQ_HEAD(name, type) \ struct name { \ struct type *tqh_first; /* first element */ \ struct type **tqh_last; /* addr of last next element */ \ TRACEBUF \ } #define TAILQ_CLASS_HEAD(name, type) \ struct name { \ class type *tqh_first; /* first element */ \ class type **tqh_last; /* addr of last next element */ \ TRACEBUF \ } #define TAILQ_HEAD_INITIALIZER(head) \ { NULL, &(head).tqh_first, TRACEBUF_INITIALIZER } #define TAILQ_ENTRY(type) \ struct { \ struct type *tqe_next; /* next element */ \ struct type **tqe_prev; /* address of previous next element */ \ TRACEBUF \ } #define TAILQ_CLASS_ENTRY(type) \ struct { \ class type *tqe_next; /* next element */ \ class type **tqe_prev; /* address of previous next element */ \ TRACEBUF \ } /* * Tail queue functions. */ #if (defined(_KERNEL) && defined(INVARIANTS)) /* * QMD_TAILQ_CHECK_HEAD(TAILQ_HEAD *head, TAILQ_ENTRY NAME) * * If the tailq is non-empty, validates that the first element of the tailq * points back at 'head.' */ #define QMD_TAILQ_CHECK_HEAD(head, field) do { \ if (!TAILQ_EMPTY(head) && \ TAILQ_FIRST((head))->field.tqe_prev != \ &TAILQ_FIRST((head))) \ panic("Bad tailq head %p first->prev != head", (head)); \ } while (0) /* * QMD_TAILQ_CHECK_TAIL(TAILQ_HEAD *head, TAILQ_ENTRY NAME) * * Validates that the tail of the tailq is a pointer to pointer to NULL. */ #define QMD_TAILQ_CHECK_TAIL(head, field) do { \ if (*(head)->tqh_last != NULL) \ panic("Bad tailq NEXT(%p->tqh_last) != NULL", (head)); \ } while (0) /* * QMD_TAILQ_CHECK_NEXT(TYPE *elm, TAILQ_ENTRY NAME) * * If an element follows 'elm' in the tailq, validates that the next element * points back at 'elm.' */ #define QMD_TAILQ_CHECK_NEXT(elm, field) do { \ if (TAILQ_NEXT((elm), field) != NULL && \ TAILQ_NEXT((elm), field)->field.tqe_prev != \ &((elm)->field.tqe_next)) \ panic("Bad link elm %p next->prev != elm", (elm)); \ } while (0) /* * QMD_TAILQ_CHECK_PREV(TYPE *elm, TAILQ_ENTRY NAME) * * Validates that the previous element (or head of the tailq) points to 'elm.' 
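 *
 * Put differently: for a hypothetical element 'e' linked with
 * TAILQ_INSERT_TAIL(head, e, field), the check below asserts
 *
 *	*e->field.tqe_prev == e
 *
 * i.e. whatever pointer last linked 'e' into the tailq still points
 * at it.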
*/ #define QMD_TAILQ_CHECK_PREV(elm, field) do { \ if (*(elm)->field.tqe_prev != (elm)) \ panic("Bad link elm %p prev->next != elm", (elm)); \ } while (0) #else #define QMD_TAILQ_CHECK_HEAD(head, field) #define QMD_TAILQ_CHECK_TAIL(head, headname) #define QMD_TAILQ_CHECK_NEXT(elm, field) #define QMD_TAILQ_CHECK_PREV(elm, field) #endif /* (_KERNEL && INVARIANTS) */ #define TAILQ_CONCAT(head1, head2, field) do { \ if (!TAILQ_EMPTY(head2)) { \ *(head1)->tqh_last = (head2)->tqh_first; \ (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ (head1)->tqh_last = (head2)->tqh_last; \ TAILQ_INIT((head2)); \ QMD_TRACE_HEAD(head1); \ QMD_TRACE_HEAD(head2); \ } \ } while (0) #define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) #define TAILQ_FIRST(head) ((head)->tqh_first) #define TAILQ_FOREACH(var, head, field) \ for ((var) = TAILQ_FIRST((head)); \ (var); \ (var) = TAILQ_NEXT((var), field)) #define TAILQ_FOREACH_FROM(var, head, field) \ for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ (var); \ (var) = TAILQ_NEXT((var), field)) #define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = TAILQ_FIRST((head)); \ (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ (var) = (tvar)) #define TAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ (var) = (tvar)) #define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ for ((var) = TAILQ_LAST((head), headname); \ (var); \ (var) = TAILQ_PREV((var), headname, field)) #define TAILQ_FOREACH_REVERSE_FROM(var, head, headname, field) \ for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ (var); \ (var) = TAILQ_PREV((var), headname, field)) #define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ for ((var) = TAILQ_LAST((head), headname); \ (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ (var) = (tvar)) #define TAILQ_FOREACH_REVERSE_FROM_SAFE(var, head, headname, field, tvar) \ for ((var) = ((var) ? 
(var) : TAILQ_LAST((head), headname)); \ (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ (var) = (tvar)) #define TAILQ_INIT(head) do { \ TAILQ_FIRST((head)) = NULL; \ (head)->tqh_last = &TAILQ_FIRST((head)); \ QMD_TRACE_HEAD(head); \ } while (0) #define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ QMD_TAILQ_CHECK_NEXT(listelm, field); \ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ TAILQ_NEXT((elm), field)->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ else { \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ QMD_TRACE_HEAD(head); \ } \ TAILQ_NEXT((listelm), field) = (elm); \ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ QMD_TRACE_ELEM(&(elm)->field); \ QMD_TRACE_ELEM(&(listelm)->field); \ } while (0) #define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ QMD_TAILQ_CHECK_PREV(listelm, field); \ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ TAILQ_NEXT((elm), field) = (listelm); \ *(listelm)->field.tqe_prev = (elm); \ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ QMD_TRACE_ELEM(&(elm)->field); \ QMD_TRACE_ELEM(&(listelm)->field); \ } while (0) #define TAILQ_INSERT_HEAD(head, elm, field) do { \ QMD_TAILQ_CHECK_HEAD(head, field); \ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ TAILQ_FIRST((head))->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ else \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ TAILQ_FIRST((head)) = (elm); \ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ QMD_TRACE_HEAD(head); \ QMD_TRACE_ELEM(&(elm)->field); \ } while (0) #define TAILQ_INSERT_TAIL(head, elm, field) do { \ QMD_TAILQ_CHECK_TAIL(head, field); \ TAILQ_NEXT((elm), field) = NULL; \ (elm)->field.tqe_prev = (head)->tqh_last; \ *(head)->tqh_last = (elm); \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ QMD_TRACE_HEAD(head); \ QMD_TRACE_ELEM(&(elm)->field); \ } while (0) #define TAILQ_LAST(head, headname) \ (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +/* + * The FAST function is fast in that it causes no data access other + * then the access to the head. The standard LAST function above + * will cause a data access of both the element you want and + * the previous element. FAST is very useful for instances when + * you may want to prefetch the last data element. + */ +#define TAILQ_LAST_FAST(head, type, field) \ + (TAILQ_EMPTY(head) ? 
NULL : __containerof((head)->tqh_last, QUEUE_TYPEOF(type), field.tqe_next)) #define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) #define TAILQ_PREV(elm, headname, field) \ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) #define TAILQ_REMOVE(head, elm, field) do { \ QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \ QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \ QMD_TAILQ_CHECK_NEXT(elm, field); \ QMD_TAILQ_CHECK_PREV(elm, field); \ if ((TAILQ_NEXT((elm), field)) != NULL) \ TAILQ_NEXT((elm), field)->field.tqe_prev = \ (elm)->field.tqe_prev; \ else { \ (head)->tqh_last = (elm)->field.tqe_prev; \ QMD_TRACE_HEAD(head); \ } \ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ TRASHIT(*oldnext); \ TRASHIT(*oldprev); \ QMD_TRACE_ELEM(&(elm)->field); \ } while (0) #define TAILQ_SWAP(head1, head2, type, field) do { \ QUEUE_TYPEOF(type) *swap_first = (head1)->tqh_first; \ QUEUE_TYPEOF(type) **swap_last = (head1)->tqh_last; \ (head1)->tqh_first = (head2)->tqh_first; \ (head1)->tqh_last = (head2)->tqh_last; \ (head2)->tqh_first = swap_first; \ (head2)->tqh_last = swap_last; \ if ((swap_first = (head1)->tqh_first) != NULL) \ swap_first->field.tqe_prev = &(head1)->tqh_first; \ else \ (head1)->tqh_last = &(head1)->tqh_first; \ if ((swap_first = (head2)->tqh_first) != NULL) \ swap_first->field.tqe_prev = &(head2)->tqh_first; \ else \ (head2)->tqh_last = &(head2)->tqh_first; \ } while (0) #endif /* !_SYS_QUEUE_H_ */ Index: head/sys/sys/sockbuf.h =================================================================== --- head/sys/sys/sockbuf.h (revision 334803) +++ head/sys/sys/sockbuf.h (revision 334804) @@ -1,250 +1,254 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 * * $FreeBSD$ */ #ifndef _SYS_SOCKBUF_H_ #define _SYS_SOCKBUF_H_ /* * Constants for sb_flags field of struct sockbuf/xsockbuf. 
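 *
 * A hedged sketch of how these bits are usually manipulated, with the
 * sockbuf lock held ('so' is a hypothetical socket):
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 *	SOCKBUF_UNLOCK(&so->so_rcv);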
*/ #define SB_WAIT 0x04 /* someone is waiting for data/space */ #define SB_SEL 0x08 /* someone is selecting */ #define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ #define SB_UPCALL 0x20 /* someone wants an upcall */ #define SB_NOINTR 0x40 /* operations not interruptible */ #define SB_AIO 0x80 /* AIO operations queued */ #define SB_KNOTE 0x100 /* kernel note attached */ #define SB_NOCOALESCE 0x200 /* don't coalesce new data into existing mbufs */ #define SB_IN_TOE 0x400 /* socket buffer is in the middle of an operation */ #define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ #define SB_STOP 0x1000 /* backpressure indicator */ #define SB_AIO_RUNNING 0x2000 /* AIO operation running */ #define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ #define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ #define SBS_RCVATMARK 0x0040 /* at mark on input */ #if defined(_KERNEL) || defined(_WANT_SOCKET) #include #include #include #include #define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ struct mbuf; struct sockaddr; struct socket; struct thread; struct selinfo; /* * Variables for socket buffering. * * Locking key to struct sockbuf: * (a) locked by SOCKBUF_LOCK(). */ struct sockbuf { struct mtx sb_mtx; /* sockbuf lock */ struct sx sb_sx; /* prevent I/O interlacing */ struct selinfo *sb_sel; /* process selecting read/write */ short sb_state; /* (a) socket state on sockbuf */ #define sb_startzero sb_mb struct mbuf *sb_mb; /* (a) the mbuf chain */ struct mbuf *sb_mbtail; /* (a) the last mbuf in the chain */ struct mbuf *sb_lastrecord; /* (a) first mbuf of last * record in socket buffer */ struct mbuf *sb_sndptr; /* (a) pointer into mbuf chain */ struct mbuf *sb_fnrdy; /* (a) pointer to first not ready buffer */ u_int sb_sndptroff; /* (a) byte offset of ptr into chain */ u_int sb_acc; /* (a) available chars in buffer */ u_int sb_ccc; /* (a) claimed chars in buffer */ u_int sb_hiwat; /* (a) max actual char count */ u_int sb_mbcnt; /* (a) chars of mbufs used */ u_int sb_mcnt; /* (a) number of mbufs in buffer */ u_int sb_ccnt; /* (a) number of clusters in buffer */ u_int sb_mbmax; /* (a) max chars of mbufs to use */ u_int sb_ctl; /* (a) non-data chars in buffer */ int sb_lowat; /* (a) low water mark */ sbintime_t sb_timeo; /* (a) timeout for read/write */ short sb_flags; /* (a) flags, see below */ int (*sb_upcall)(struct socket *, void *, int); /* (a) */ void *sb_upcallarg; /* (a) */ TAILQ_HEAD(, kaiocb) sb_aiojobq; /* (a) pending AIO ops */ struct task sb_aiotask; /* AIO task */ }; #endif /* defined(_KERNEL) || defined(_WANT_SOCKET) */ #ifdef _KERNEL /* * Per-socket buffer mutex used to protect most fields in the socket * buffer. */ #define SOCKBUF_MTX(_sb) (&(_sb)->sb_mtx) #define SOCKBUF_LOCK_INIT(_sb, _name) \ mtx_init(SOCKBUF_MTX(_sb), _name, NULL, MTX_DEF) #define SOCKBUF_LOCK_DESTROY(_sb) mtx_destroy(SOCKBUF_MTX(_sb)) #define SOCKBUF_LOCK(_sb) mtx_lock(SOCKBUF_MTX(_sb)) #define SOCKBUF_OWNED(_sb) mtx_owned(SOCKBUF_MTX(_sb)) #define SOCKBUF_UNLOCK(_sb) mtx_unlock(SOCKBUF_MTX(_sb)) #define SOCKBUF_LOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_OWNED) #define SOCKBUF_UNLOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_NOTOWNED) /* * Socket buffer private mbuf(9) flags. 
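 *
 * A hedged sketch: a protocol that queues data whose backing store is
 * still being filled (an asynchronous sendfile-style path) appends the
 * chain with M_NOTREADY set and later reports completion through
 * sbready(); 'so', 'm' and 'count' are hypothetical:
 *
 *	SOCKBUF_LOCK(&so->so_snd);
 *	error = sbready(&so->so_snd, m, count);
 *	SOCKBUF_UNLOCK(&so->so_snd);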
*/ #define M_NOTREADY M_PROTO1 /* m_data not populated yet */ #define M_BLOCKED M_PROTO2 /* M_NOTREADY in front of m */ #define M_NOTAVAIL (M_NOTREADY | M_BLOCKED) void sbappend(struct sockbuf *sb, struct mbuf *m, int flags); void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags); void sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags); void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags); int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control); int sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control); void sbappendrecord(struct sockbuf *sb, struct mbuf *m0); void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0); void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); struct mbuf * sbcreatecontrol(caddr_t p, int size, int type, int level); void sbdestroy(struct sockbuf *sb, struct socket *so); void sbdrop(struct sockbuf *sb, int len); void sbdrop_locked(struct sockbuf *sb, int len); struct mbuf * sbcut_locked(struct sockbuf *sb, int len); void sbdroprecord(struct sockbuf *sb); void sbdroprecord_locked(struct sockbuf *sb); void sbflush(struct sockbuf *sb); void sbflush_locked(struct sockbuf *sb); void sbrelease(struct sockbuf *sb, struct socket *so); void sbrelease_internal(struct sockbuf *sb, struct socket *so); void sbrelease_locked(struct sockbuf *sb, struct socket *so); int sbsetopt(struct socket *so, int cmd, u_long cc); int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td); struct mbuf * sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff); struct mbuf * + sbsndptr_noadv(struct sockbuf *sb, u_int off, u_int *moff); +void + sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, u_int len); +struct mbuf * sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff); int sbwait(struct sockbuf *sb); int sblock(struct sockbuf *sb, int flags); void sbunlock(struct sockbuf *sb); void sballoc(struct sockbuf *, struct mbuf *); void sbfree(struct sockbuf *, struct mbuf *); int sbready(struct sockbuf *, struct mbuf *, int); /* * Return how much data is available to be taken out of socket * buffer right now. */ static inline u_int sbavail(struct sockbuf *sb) { #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif return (sb->sb_acc); } /* * Return how much data sits there in the socket buffer * It might be that some data is not yet ready to be read. */ static inline u_int sbused(struct sockbuf *sb) { #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif return (sb->sb_ccc); } /* * How much space is there in a socket buffer (so->so_snd or so->so_rcv)? * This is problematical if the fields are unsigned, as the space might * still be negative (ccc > hiwat or mbcnt > mbmax). */ static inline long sbspace(struct sockbuf *sb) { int bleft, mleft; /* size should match sockbuf fields */ #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif if (sb->sb_flags & SB_STOP) return(0); bleft = sb->sb_hiwat - sb->sb_ccc; mleft = sb->sb_mbmax - sb->sb_mbcnt; return ((bleft < mleft) ? 
bleft : mleft); } #define SB_EMPTY_FIXUP(sb) do { \ if ((sb)->sb_mb == NULL) { \ (sb)->sb_mbtail = NULL; \ (sb)->sb_lastrecord = NULL; \ } \ } while (/*CONSTCOND*/0) #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *, const char *, int); void sblastmbufchk(struct sockbuf *, const char *, int); void sbcheck(struct sockbuf *, const char *, int); #define SBLASTRECORDCHK(sb) sblastrecordchk((sb), __FILE__, __LINE__) #define SBLASTMBUFCHK(sb) sblastmbufchk((sb), __FILE__, __LINE__) #define SBCHECK(sb) sbcheck((sb), __FILE__, __LINE__) #else #define SBLASTRECORDCHK(sb) do {} while (0) #define SBLASTMBUFCHK(sb) do {} while (0) #define SBCHECK(sb) do {} while (0) #endif /* SOCKBUF_DEBUG */ #endif /* _KERNEL */ #endif /* _SYS_SOCKBUF_H_ */ Index: head/sys/sys/time.h =================================================================== --- head/sys/sys/time.h (revision 334803) +++ head/sys/sys/time.h (revision 334804) @@ -1,548 +1,564 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)time.h 8.5 (Berkeley) 5/4/95 * $FreeBSD$ */ #ifndef _SYS_TIME_H_ #define _SYS_TIME_H_ #include #include #include struct timezone { int tz_minuteswest; /* minutes west of Greenwich */ int tz_dsttime; /* type of dst correction */ }; #define DST_NONE 0 /* not on dst */ #define DST_USA 1 /* USA style dst */ #define DST_AUST 2 /* Australian style dst */ #define DST_WET 3 /* Western European dst */ #define DST_MET 4 /* Middle European dst */ #define DST_EET 5 /* Eastern European dst */ #define DST_CAN 6 /* Canada */ #if __BSD_VISIBLE struct bintime { time_t sec; uint64_t frac; }; static __inline void bintime_addx(struct bintime *_bt, uint64_t _x) { uint64_t _u; _u = _bt->frac; _bt->frac += _x; if (_u > _bt->frac) _bt->sec++; } static __inline void bintime_add(struct bintime *_bt, const struct bintime *_bt2) { uint64_t _u; _u = _bt->frac; _bt->frac += _bt2->frac; if (_u > _bt->frac) _bt->sec++; _bt->sec += _bt2->sec; } static __inline void bintime_sub(struct bintime *_bt, const struct bintime *_bt2) { uint64_t _u; _u = _bt->frac; _bt->frac -= _bt2->frac; if (_u < _bt->frac) _bt->sec--; _bt->sec -= _bt2->sec; } static __inline void bintime_mul(struct bintime *_bt, u_int _x) { uint64_t _p1, _p2; _p1 = (_bt->frac & 0xffffffffull) * _x; _p2 = (_bt->frac >> 32) * _x + (_p1 >> 32); _bt->sec *= _x; _bt->sec += (_p2 >> 32); _bt->frac = (_p2 << 32) | (_p1 & 0xffffffffull); } static __inline void bintime_shift(struct bintime *_bt, int _exp) { if (_exp > 0) { _bt->sec <<= _exp; _bt->sec |= _bt->frac >> (64 - _exp); _bt->frac <<= _exp; } else if (_exp < 0) { _bt->frac >>= -_exp; _bt->frac |= (uint64_t)_bt->sec << (64 + _exp); _bt->sec >>= -_exp; } } #define bintime_clear(a) ((a)->sec = (a)->frac = 0) #define bintime_isset(a) ((a)->sec || (a)->frac) #define bintime_cmp(a, b, cmp) \ (((a)->sec == (b)->sec) ? \ ((a)->frac cmp (b)->frac) : \ ((a)->sec cmp (b)->sec)) #define SBT_1S ((sbintime_t)1 << 32) #define SBT_1M (SBT_1S * 60) #define SBT_1MS (SBT_1S / 1000) #define SBT_1US (SBT_1S / 1000000) #define SBT_1NS (SBT_1S / 1000000000) /* beware rounding, see nstosbt() */ #define SBT_MAX 0x7fffffffffffffffLL static __inline int sbintime_getsec(sbintime_t _sbt) { return (_sbt >> 32); } static __inline sbintime_t bttosbt(const struct bintime _bt) { return (((sbintime_t)_bt.sec << 32) + (_bt.frac >> 32)); } static __inline struct bintime sbttobt(sbintime_t _sbt) { struct bintime _bt; _bt.sec = _sbt >> 32; _bt.frac = _sbt << 32; return (_bt); } /* * Decimal<->sbt conversions. Multiplying or dividing by SBT_1NS results in * large roundoff errors which sbttons() and nstosbt() avoid. Millisecond and * microsecond functions are also provided for completeness. */ static __inline int64_t sbttons(sbintime_t _sbt) { return ((1000000000 * _sbt) >> 32); } static __inline sbintime_t nstosbt(int64_t _ns) { return ((_ns * (((uint64_t)1 << 63) / 500000000)) >> 32); } static __inline int64_t sbttous(sbintime_t _sbt) { return ((1000000 * _sbt) >> 32); } static __inline sbintime_t ustosbt(int64_t _us) { return ((_us * (((uint64_t)1 << 63) / 500000)) >> 32); } static __inline int64_t sbttoms(sbintime_t _sbt) { return ((1000 * _sbt) >> 32); } static __inline sbintime_t mstosbt(int64_t _ms) { return ((_ms * (((uint64_t)1 << 63) / 500)) >> 32); } /*- * Background information: * * When converting between timestamps on parallel timescales of differing * resolutions it is historical and scientific practice to round down rather * than doing 4/5 rounding. * * The date changes at midnight, not at noon. 
* * Even at 15:59:59.999999999 it's not four'o'clock. * * time_second ticks after N.999999999 not after N.4999999999 */ static __inline void bintime2timespec(const struct bintime *_bt, struct timespec *_ts) { _ts->tv_sec = _bt->sec; _ts->tv_nsec = ((uint64_t)1000000000 * (uint32_t)(_bt->frac >> 32)) >> 32; } static __inline void timespec2bintime(const struct timespec *_ts, struct bintime *_bt) { _bt->sec = _ts->tv_sec; /* 18446744073 = int(2^64 / 1000000000) */ _bt->frac = _ts->tv_nsec * (uint64_t)18446744073LL; } static __inline void bintime2timeval(const struct bintime *_bt, struct timeval *_tv) { _tv->tv_sec = _bt->sec; _tv->tv_usec = ((uint64_t)1000000 * (uint32_t)(_bt->frac >> 32)) >> 32; } static __inline void timeval2bintime(const struct timeval *_tv, struct bintime *_bt) { _bt->sec = _tv->tv_sec; /* 18446744073709 = int(2^64 / 1000000) */ _bt->frac = _tv->tv_usec * (uint64_t)18446744073709LL; } static __inline struct timespec sbttots(sbintime_t _sbt) { struct timespec _ts; _ts.tv_sec = _sbt >> 32; _ts.tv_nsec = sbttons((uint32_t)_sbt); return (_ts); } static __inline sbintime_t tstosbt(struct timespec _ts) { return (((sbintime_t)_ts.tv_sec << 32) + nstosbt(_ts.tv_nsec)); } static __inline struct timeval sbttotv(sbintime_t _sbt) { struct timeval _tv; _tv.tv_sec = _sbt >> 32; _tv.tv_usec = sbttous((uint32_t)_sbt); return (_tv); } static __inline sbintime_t tvtosbt(struct timeval _tv) { return (((sbintime_t)_tv.tv_sec << 32) + ustosbt(_tv.tv_usec)); } #endif /* __BSD_VISIBLE */ #ifdef _KERNEL +/* + * Simple macros to convert ticks to milliseconds + * or microseconds and vice-versa. The answer + * will always be at least 1. Note the return + * value is a uint32_t however we step up the + * operations to 64 bit to avoid any overflow/underflow + * problems. + */ +#define TICKS_2_MSEC(t) max(1, (uint32_t)(hz == 1000) ? \ + (t) : (((uint64_t)(t) * (uint64_t)1000)/(uint64_t)hz)) +#define TICKS_2_USEC(t) max(1, (uint32_t)(hz == 1000) ? \ + ((t) * 1000) : (((uint64_t)(t) * (uint64_t)1000000)/(uint64_t)hz)) +#define MSEC_2_TICKS(m) max(1, (uint32_t)((hz == 1000) ? \ + (m) : ((uint64_t)(m) * (uint64_t)hz)/(uint64_t)1000)) +#define USEC_2_TICKS(u) max(1, (uint32_t)((hz == 1000) ? \ + ((u) / 1000) : ((uint64_t)(u) * (uint64_t)hz)/(uint64_t)1000000)) /* Operations on timespecs */ #define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0) #define timespecisset(tvp) ((tvp)->tv_sec || (tvp)->tv_nsec) #define timespeccmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? \ ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \ ((tvp)->tv_sec cmp (uvp)->tv_sec)) #define timespecadd(vvp, uvp) \ do { \ (vvp)->tv_sec += (uvp)->tv_sec; \ (vvp)->tv_nsec += (uvp)->tv_nsec; \ if ((vvp)->tv_nsec >= 1000000000) { \ (vvp)->tv_sec++; \ (vvp)->tv_nsec -= 1000000000; \ } \ } while (0) #define timespecsub(vvp, uvp) \ do { \ (vvp)->tv_sec -= (uvp)->tv_sec; \ (vvp)->tv_nsec -= (uvp)->tv_nsec; \ if ((vvp)->tv_nsec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_nsec += 1000000000; \ } \ } while (0) /* Operations on timevals. */ #define timevalclear(tvp) ((tvp)->tv_sec = (tvp)->tv_usec = 0) #define timevalisset(tvp) ((tvp)->tv_sec || (tvp)->tv_usec) #define timevalcmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? 
\ ((tvp)->tv_usec cmp (uvp)->tv_usec) : \ ((tvp)->tv_sec cmp (uvp)->tv_sec)) /* timevaladd and timevalsub are not inlined */ #endif /* _KERNEL */ #ifndef _KERNEL /* NetBSD/OpenBSD compatible interfaces */ #define timerclear(tvp) ((tvp)->tv_sec = (tvp)->tv_usec = 0) #define timerisset(tvp) ((tvp)->tv_sec || (tvp)->tv_usec) #define timercmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? \ ((tvp)->tv_usec cmp (uvp)->tv_usec) : \ ((tvp)->tv_sec cmp (uvp)->tv_sec)) #define timeradd(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec + (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec + (uvp)->tv_usec; \ if ((vvp)->tv_usec >= 1000000) { \ (vvp)->tv_sec++; \ (vvp)->tv_usec -= 1000000; \ } \ } while (0) #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 1000000; \ } \ } while (0) #endif /* * Names of the interval timers, and structure * defining a timer setting. */ #define ITIMER_REAL 0 #define ITIMER_VIRTUAL 1 #define ITIMER_PROF 2 struct itimerval { struct timeval it_interval; /* timer interval */ struct timeval it_value; /* current value */ }; /* * Getkerninfo clock information structure */ struct clockinfo { int hz; /* clock frequency */ int tick; /* micro-seconds per hz tick */ int spare; int stathz; /* statistics clock frequency */ int profhz; /* profiling clock frequency */ }; /* These macros are also in time.h. */ #ifndef CLOCK_REALTIME #define CLOCK_REALTIME 0 #define CLOCK_VIRTUAL 1 #define CLOCK_PROF 2 #define CLOCK_MONOTONIC 4 #define CLOCK_UPTIME 5 /* FreeBSD-specific. */ #define CLOCK_UPTIME_PRECISE 7 /* FreeBSD-specific. */ #define CLOCK_UPTIME_FAST 8 /* FreeBSD-specific. */ #define CLOCK_REALTIME_PRECISE 9 /* FreeBSD-specific. */ #define CLOCK_REALTIME_FAST 10 /* FreeBSD-specific. */ #define CLOCK_MONOTONIC_PRECISE 11 /* FreeBSD-specific. */ #define CLOCK_MONOTONIC_FAST 12 /* FreeBSD-specific. */ #define CLOCK_SECOND 13 /* FreeBSD-specific. */ #define CLOCK_THREAD_CPUTIME_ID 14 #define CLOCK_PROCESS_CPUTIME_ID 15 #endif #ifndef TIMER_ABSTIME #define TIMER_RELTIME 0x0 /* relative timer */ #define TIMER_ABSTIME 0x1 /* absolute timer */ #endif #if __BSD_VISIBLE #define CPUCLOCK_WHICH_PID 0 #define CPUCLOCK_WHICH_TID 1 #endif #ifdef _KERNEL /* * Kernel to clock driver interface. */ void inittodr(time_t base); void resettodr(void); extern volatile time_t time_second; extern volatile time_t time_uptime; extern struct bintime tc_tick_bt; extern sbintime_t tc_tick_sbt; extern struct bintime tick_bt; extern sbintime_t tick_sbt; extern int tc_precexp; extern int tc_timepercentage; extern struct bintime bt_timethreshold; extern struct bintime bt_tickthreshold; extern sbintime_t sbt_timethreshold; extern sbintime_t sbt_tickthreshold; extern volatile int rtc_generation; /* * Functions for looking at our clock: [get]{bin,nano,micro}[up]time() * * Functions without the "get" prefix returns the best timestamp * we can produce in the given format. * * "bin" == struct bintime == seconds + 64 bit fraction of seconds. * "nano" == struct timespec == seconds + nanoseconds. * "micro" == struct timeval == seconds + microseconds. * * Functions containing "up" returns time relative to boot and * should be used for calculating time intervals. * * Functions without "up" returns UTC time. 
* * Functions with the "get" prefix returns a less precise result * much faster than the functions without "get" prefix and should * be used where a precision of 1/hz seconds is acceptable or where * performance is priority. (NB: "precision", _not_ "resolution" !) */ void binuptime(struct bintime *bt); void nanouptime(struct timespec *tsp); void microuptime(struct timeval *tvp); static __inline sbintime_t sbinuptime(void) { struct bintime _bt; binuptime(&_bt); return (bttosbt(_bt)); } void bintime(struct bintime *bt); void nanotime(struct timespec *tsp); void microtime(struct timeval *tvp); void getbinuptime(struct bintime *bt); void getnanouptime(struct timespec *tsp); void getmicrouptime(struct timeval *tvp); static __inline sbintime_t getsbinuptime(void) { struct bintime _bt; getbinuptime(&_bt); return (bttosbt(_bt)); } void getbintime(struct bintime *bt); void getnanotime(struct timespec *tsp); void getmicrotime(struct timeval *tvp); void getboottime(struct timeval *boottime); void getboottimebin(struct bintime *boottimebin); /* Other functions */ int itimerdecr(struct itimerval *itp, int usec); int itimerfix(struct timeval *tv); int ppsratecheck(struct timeval *, int *, int); int ratecheck(struct timeval *, const struct timeval *); void timevaladd(struct timeval *t1, const struct timeval *t2); void timevalsub(struct timeval *t1, const struct timeval *t2); int tvtohz(struct timeval *tv); #define TC_DEFAULTPERC 5 #define BT2FREQ(bt) \ (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ ((bt)->frac >> 1)) #define SBT2FREQ(sbt) ((SBT_1S + ((sbt) >> 1)) / (sbt)) #define FREQ2BT(freq, bt) \ { \ (bt)->sec = 0; \ (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ } #define TIMESEL(sbt, sbt2) \ (((sbt2) >= sbt_timethreshold) ? \ ((*(sbt) = getsbinuptime()), 1) : ((*(sbt) = sbinuptime()), 0)) #else /* !_KERNEL */ #include #include #include __BEGIN_DECLS int setitimer(int, const struct itimerval *, struct itimerval *); int utimes(const char *, const struct timeval *); #if __BSD_VISIBLE int adjtime(const struct timeval *, struct timeval *); int clock_getcpuclockid2(id_t, int, clockid_t *); int futimes(int, const struct timeval *); int futimesat(int, const char *, const struct timeval [2]); int lutimes(const char *, const struct timeval *); int settimeofday(const struct timeval *, const struct timezone *); #endif #if __XSI_VISIBLE int getitimer(int, struct itimerval *); int gettimeofday(struct timeval *, struct timezone *); #endif __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_TIME_H_ */
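The sbspace() inline in sys/sys/sockbuf.h above computes free space as the smaller of two headrooms, the data-byte headroom (sb_hiwat - sb_ccc) and the mbuf-accounting headroom (sb_mbmax - sb_mbcnt), and short-circuits to zero while the SB_STOP backpressure flag is set. A minimal standalone sketch of that rule, using stand-in fields rather than the kernel's struct sockbuf:

#include <stdio.h>

#define SB_STOP	0x1000			/* backpressure indicator, as above */

struct sb_counts {			/* stand-in for the relevant sockbuf fields */
	unsigned hiwat;			/* max data bytes */
	unsigned ccc;			/* claimed data bytes */
	unsigned mbmax;			/* max mbuf-accounted bytes */
	unsigned mbcnt;			/* mbuf-accounted bytes in use */
	int flags;
};

static long
sb_space(const struct sb_counts *sb)
{
	int bleft, mleft;

	if (sb->flags & SB_STOP)
		return (0);
	bleft = sb->hiwat - sb->ccc;	/* may be negative, hence signed */
	mleft = sb->mbmax - sb->mbcnt;
	return (bleft < mleft ? bleft : mleft);
}

int
main(void)
{
	struct sb_counts sb = { 65536, 60000, 131072, 120000, 0 };

	printf("space: %ld\n", sb_space(&sb));	/* min(5536, 11072) = 5536 */
	sb.flags |= SB_STOP;
	printf("space: %ld\n", sb_space(&sb));	/* 0 under backpressure */
	return (0);
}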
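bintime_add() and bintime_addx() in sys/sys/time.h above detect carry out of the 64-bit fraction by checking whether the unsigned sum wrapped below its previous value, then propagate that carry into the seconds field. The same idiom in isolation, with a stand-in structure rather than the real struct bintime:

#include <stdint.h>
#include <stdio.h>

struct bt {				/* stand-in for struct bintime */
	int64_t  sec;
	uint64_t frac;			/* fraction of a second, units of 2^-64 s */
};

static void
bt_add(struct bt *a, const struct bt *b)
{
	uint64_t old = a->frac;

	a->frac += b->frac;
	if (old > a->frac)		/* sum wrapped: carry one second */
		a->sec++;
	a->sec += b->sec;
}

int
main(void)
{
	/* 0.75 s + 0.5 s: the fractions overflow and carry into the seconds. */
	struct bt a = { 0, (uint64_t)3 << 62 };	/* 0.75 s */
	struct bt b = { 0, (uint64_t)1 << 63 };	/* 0.50 s */

	bt_add(&a, &b);
	printf("sec=%lld frac=%llu (0.25 s is 1<<62 = %llu)\n",
	    (long long)a.sec, (unsigned long long)a.frac,
	    (unsigned long long)((uint64_t)1 << 62));
	return (0);
}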
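The sbintime_t helpers in sys/sys/time.h above treat the value as 32.32 fixed point: the upper 32 bits hold whole seconds, the lower 32 bits a binary fraction, and nstosbt()/sbttons() scale to and from nanoseconds without the rounding error of multiplying by SBT_1NS. A self-contained sketch of that encoding; ns_to_sbt() and sbt_to_ns() are local mirrors of the header's formulas, and the fractional conversion is applied only to sub-second values, as tstosbt() does with tv_nsec:

#include <stdint.h>
#include <stdio.h>

typedef int64_t sbintime_t;		/* 32.32 fixed point, as in sys/time.h */

/* Mirror of nstosbt(): intended for 0 <= ns < 1000000000. */
static sbintime_t
ns_to_sbt(int64_t ns)
{
	return ((ns * (((uint64_t)1 << 63) / 500000000)) >> 32);
}

/* Mirror of sbttons(), used here on the fractional (sub-second) part. */
static int64_t
sbt_to_ns(sbintime_t sbt)
{
	return ((1000000000 * sbt) >> 32);
}

int
main(void)
{
	/* Encode 1.25 s the same way tstosbt() does: (sec << 32) + fraction. */
	sbintime_t sbt = ((sbintime_t)1 << 32) + ns_to_sbt(250000000);

	printf("seconds:  %d\n", (int)(sbt >> 32));		/* 1 */
	printf("fraction: %lld ns\n",				/* ~250000000, truncated */
	    (long long)sbt_to_ns((uint32_t)sbt));
	return (0);
}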
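The TICKS_2_MSEC()/MSEC_2_TICKS() family added to sys/sys/time.h above converts between the kernel tick count and milliseconds or microseconds, widening to 64-bit arithmetic before scaling by hz and clamping the result to a minimum of 1. A minimal userland sketch of the millisecond pair only, assuming an illustrative hz of 250; the real macros use the kernel's global hz, the kernel max(), and special-case hz == 1000:

#include <stdint.h>
#include <stdio.h>

static const uint64_t hz = 250;		/* assumed tick rate for this example */

static uint32_t
ticks_to_msec(uint32_t t)
{
	uint64_t ms;

	/* Widen to 64 bits before scaling to avoid intermediate overflow. */
	ms = (hz == 1000) ? t : ((uint64_t)t * 1000) / hz;
	return (ms < 1 ? 1 : (uint32_t)ms);	/* never report zero */
}

static uint32_t
msec_to_ticks(uint32_t m)
{
	uint64_t t;

	t = (hz == 1000) ? m : ((uint64_t)m * hz) / 1000;
	return (t < 1 ? 1 : (uint32_t)t);
}

int
main(void)
{
	printf("100 ticks = %u ms\n", (unsigned)ticks_to_msec(100));	/* 400 ms at hz=250 */
	printf("40 ms = %u ticks\n", (unsigned)msec_to_ticks(40));	/* 10 ticks */
	return (0);
}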