Index: head/sys/kern/uipc_sockbuf.c =================================================================== --- head/sys/kern/uipc_sockbuf.c (revision 131150) +++ head/sys/kern/uipc_sockbuf.c (revision 131151) @@ -1,1432 +1,1429 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_param.h" #include #include /* for aio_swake proto */ #include #include #include /* for maxfiles */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int maxsockets; void (*aio_swake)(struct socket *, struct sockbuf *); /* * Primitive routines for operating on sockets and socket buffers */ u_long sb_max = SB_MAX; static u_long sb_max_adj = SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */ static u_long sb_efficiency = 8; /* parameter for sbreserve() */ /* * Procedures to manipulate state flags of socket * and do appropriate wakeups. Normal sequence from the * active (originating) side is that soisconnecting() is * called during processing of connect() call, * resulting in an eventual call to soisconnected() if/when the * connection is established. When the connection is torn down * soisdisconnecting() is called during processing of disconnect() call, * and soisdisconnected() is called when the connection to the peer * is totally severed. The semantics of these routines are such that * connectionless protocols can call soisconnected() and soisdisconnected() * only, bypassing the in-progress calls when setting up a ``connection'' * takes no time. * * From the passive side, a socket is created with * two queues of sockets: so_incomp for connections in progress * and so_comp for connections already made and awaiting user acceptance. * As a protocol is preparing incoming connections, it creates a socket * structure queued on so_incomp by calling sonewconn(). 
When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */

void
soisconnecting(so)
	register struct socket *so;
{
	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
	SOCK_UNLOCK(so);
}

void
soisconnected(so)
	struct socket *so;
{
	struct socket *head;

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;
	SOCK_UNLOCK(so);
	ACCEPT_LOCK();
	head = so->so_head;
	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			so->so_qstate &= ~SQ_INCOMP;
			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
			head->so_qlen++;
			so->so_qstate |= SQ_COMP;
			ACCEPT_UNLOCK();
			sorwakeup(head);
			wakeup_one(&head->so_timeo);
		} else {
			ACCEPT_UNLOCK();
			SOCK_LOCK(so);
			so->so_upcall = head->so_accf->so_accept_filter->accf_callback;
			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
			so->so_rcv.sb_flags |= SB_UPCALL;
			so->so_options &= ~SO_ACCEPTFILTER;
			SOCK_UNLOCK(so);
			so->so_upcall(so, so->so_upcallarg, M_TRYWAIT);
		}
		return;
	}
	ACCEPT_UNLOCK();
	wakeup(&so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}

void
soisdisconnecting(so)
	register struct socket *so;
{
	/*
	 * XXXRW: This code separately acquires SOCK_LOCK(so) and
	 * SOCKBUF_LOCK(&so->so_rcv) even though they are the same mutex to
	 * avoid introducing the assumption that they are the same.
	 */
	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;
	SOCK_UNLOCK(so);
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
-	SOCKBUF_UNLOCK(&so->so_rcv);
+	sorwakeup_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_state |= SBS_CANTSENDMORE;
-	SOCKBUF_UNLOCK(&so->so_snd);
+	sowwakeup_locked(so);
	wakeup(&so->so_timeo);
-	sowwakeup(so);
-	sorwakeup(so);
}

void
soisdisconnected(so)
	register struct socket *so;
{
	/*
	 * XXXRW: This code separately acquires SOCK_LOCK(so) and
	 * SOCKBUF_LOCK(&so->so_rcv) even though they are the same mutex to
	 * avoid introducing the assumption that they are the same.
	 */
+	/* XXXRW: so_state locking? */
	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISDISCONNECTED;
	SOCK_UNLOCK(so);
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
-	SOCKBUF_UNLOCK(&so->so_rcv);
+	sorwakeup_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
-	SOCKBUF_UNLOCK(&so->so_snd);
+	sowwakeup_locked(so);
	wakeup(&so->so_timeo);
-	sowwakeup(so);
-	sorwakeup(so);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
* * note: the ref count on the socket is 0 on return */ struct socket * sonewconn(head, connstatus) register struct socket *head; int connstatus; { register struct socket *so; int over; ACCEPT_LOCK(); over = (head->so_qlen > 3 * head->so_qlimit / 2); ACCEPT_UNLOCK(); if (over) return ((struct socket *)0); so = soalloc(M_NOWAIT); if (so == NULL) return ((struct socket *)0); if ((head->so_options & SO_ACCEPTFILTER) != 0) connstatus = 0; so->so_head = head; so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_proto = head->so_proto; so->so_timeo = head->so_timeo; so->so_cred = crhold(head->so_cred); #ifdef MAC SOCK_LOCK(head); mac_create_socket_from_socket(head, so); SOCK_UNLOCK(head); #endif if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); return ((struct socket *)0); } ACCEPT_LOCK(); if (connstatus) { TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); so->so_qstate |= SQ_COMP; head->so_qlen++; } else { /* * XXXRW: Keep removing sockets from the head until there's * room for us to insert on the tail. In pre-locking * revisions, this was a simple if(), but as we could be * racing with other threads and soabort() requires dropping * locks, we must loop waiting for the condition to be true. */ while (head->so_incqlen > head->so_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->so_incomp); TAILQ_REMOVE(&so->so_incomp, sp, so_list); head->so_incqlen--; sp->so_qstate &= ~SQ_INCOMP; sp->so_head = NULL; ACCEPT_UNLOCK(); (void) soabort(sp); ACCEPT_LOCK(); } TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); so->so_qstate |= SQ_INCOMP; head->so_incqlen++; } ACCEPT_UNLOCK(); if (connstatus) { so->so_state |= connstatus; sorwakeup(head); wakeup_one(&head->so_timeo); } return (so); } /* * Socantsendmore indicates that no more data will be sent on the * socket; it would normally be applied to a socket when the user * informs the system that no more data is to be sent, by the protocol * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data * will be received, and will normally be applied to the socket by a * protocol when it detects that the peer will send no more data. * Data queued for reading in the socket may yet be read. */ void socantsendmore_locked(so) struct socket *so; { SOCKBUF_LOCK_ASSERT(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; sowwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantsendmore(so) struct socket *so; { SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantrcvmore_locked(so) struct socket *so; { SOCKBUF_LOCK_ASSERT(&so->so_rcv); so->so_rcv.sb_state |= SBS_CANTRCVMORE; sorwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } void socantrcvmore(so) struct socket *so; { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } /* * Wait for data to arrive at/drain from a socket buffer. */ int sbwait(sb) struct sockbuf *sb; { SOCKBUF_LOCK_ASSERT(sb); sb->sb_flags |= SB_WAIT; return (msleep(&sb->sb_cc, &sb->sb_mtx, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", sb->sb_timeo)); } /* * Lock a sockbuf already known to be locked; * return any error returned from sleep (EINTR). 
*/ int sb_lock(sb) register struct sockbuf *sb; { int error; SOCKBUF_LOCK_ASSERT(sb); while (sb->sb_flags & SB_LOCK) { sb->sb_flags |= SB_WANT; error = msleep(&sb->sb_flags, &sb->sb_mtx, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, "sblock", 0); if (error) return (error); } sb->sb_flags |= SB_LOCK; return (0); } /* * Wakeup processes waiting on a socket buffer. Do asynchronous * notification via SIGIO if the socket has the SS_ASYNC flag set. * * Called with the socket buffer lock held; will release the lock by the end * of the function. This allows the caller to acquire the socket buffer lock * while testing for the need for various sorts of wakeup and hold it through * to the point where it's no longer required. We currently hold the lock * through calls out to other subsystems (with the exception of kqueue), and * then release it to avoid lock order issues. It's not clear that's * correct. */ void sowakeup(so, sb) register struct socket *so; register struct sockbuf *sb; { SOCKBUF_LOCK_ASSERT(sb); selwakeuppri(&sb->sb_sel, PSOCK); sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_cc); } KNOTE(&sb->sb_sel.si_note, 0); SOCKBUF_UNLOCK(sb); if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); if (sb->sb_flags & SB_UPCALL) (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); if (sb->sb_flags & SB_AIO) aio_swake(so, sb); mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED); } /* * Socket buffer (struct sockbuf) utility routines. * * Each socket contains two socket buffers: one for sending data and * one for receiving data. Each buffer contains a queue of mbufs, * information about the number of mbufs and amount of data in the * queue, and other fields allowing select() statements and notification * on data availability to be implemented. * * Data stored in a socket buffer is maintained as a list of records. * Each record is a list of mbufs chained together with the m_next * field. Records are chained together with the m_nextpkt field. The upper * level routine soreceive() expects the following conventions to be * observed when placing information in the receive buffer: * * 1. If the protocol requires each message be preceded by the sender's * name, then a record containing that name must be present before * any associated data (mbuf's must be of type MT_SONAME). * 2. If the protocol supports the exchange of ``access rights'' (really * just additional data associated with the message), and there are * ``rights'' to be received, then a record containing this data * should be present (mbuf's must be of type MT_RIGHTS). * 3. If a name or rights record exists, then it must be followed by * a data record, perhaps of zero length. * * Before using a new socket structure it is first necessary to reserve * buffer space to the socket, by calling sbreserve(). This should commit * some of the available buffer space in the system buffer pool for the * socket (currently, it does nothing but enforce limits). The space * should be released by calling sbrelease() when the socket is destroyed. 
*/ int soreserve(so, sndcc, rcvcc) register struct socket *so; u_long sndcc, rcvcc; { struct thread *td = curthread; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0) goto bad; if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0) goto bad2; if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (0); bad2: sbrelease_locked(&so->so_snd, so); bad: SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (ENOBUFS); } static int sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS) { int error = 0; u_long old_sb_max = sb_max; error = SYSCTL_OUT(req, arg1, sizeof(u_long)); if (error || !req->newptr) return (error); error = SYSCTL_IN(req, arg1, sizeof(u_long)); if (error) return (error); if (sb_max < MSIZE + MCLBYTES) { sb_max = old_sb_max; return (EINVAL); } sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES); return (0); } /* * Allot mbufs to a sockbuf. * Attempt to scale mbmax so that mbcnt doesn't become limiting * if buffering efficiency is near the normal case. */ int sbreserve_locked(sb, cc, so, td) struct sockbuf *sb; u_long cc; struct socket *so; struct thread *td; { rlim_t sbsize_limit; SOCKBUF_LOCK_ASSERT(sb); /* * td will only be NULL when we're in an interrupt * (e.g. in tcp_input()) */ if (cc > sb_max_adj) return (0); if (td != NULL) { PROC_LOCK(td->td_proc); sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE); PROC_UNLOCK(td->td_proc); } else sbsize_limit = RLIM_INFINITY; if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, sbsize_limit)) return (0); sb->sb_mbmax = min(cc * sb_efficiency, sb_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (1); } int sbreserve(sb, cc, so, td) struct sockbuf *sb; u_long cc; struct socket *so; struct thread *td; { int error; SOCKBUF_LOCK(sb); error = sbreserve_locked(sb, cc, so, td); SOCKBUF_UNLOCK(sb); return (error); } /* * Free mbufs held by a socket, and reserved mbuf space. */ void sbrelease_locked(sb, so) struct sockbuf *sb; struct socket *so; { SOCKBUF_LOCK_ASSERT(sb); sbflush_locked(sb); (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); sb->sb_mbmax = 0; } void sbrelease(sb, so) struct sockbuf *sb; struct socket *so; { SOCKBUF_LOCK(sb); sbrelease_locked(sb, so); SOCKBUF_UNLOCK(sb); } /* * Routines to add and remove * data from an mbuf queue. * * The routines sbappend() or sbappendrecord() are normally called to * append new mbufs to a socket buffer, after checking that adequate * space is available, comparing the function sbspace() with the amount * of data to be added. sbappendrecord() differs from sbappend() in * that data supplied is treated as the beginning of a new record. * To place a sender's address, optional access rights, and data in a * socket receive buffer, sbappendaddr() should be used. To place * access rights and data in a socket receive buffer, sbappendrights() * should be used. In either case, the new data begins a new record. * Note that unlike sbappend() and sbappendrecord(), these routines check * for the caller that there will be enough space to store the data. * Each fails if there is not enough space, or if it cannot find mbufs * to store additional information in. * * Reliable protocols may use the socket send buffer to hold data * awaiting acknowledgement. 
Data is normally copied from a socket * send buffer in a protocol with m_copy for output to a peer, * and then removing the data from the socket buffer with sbdrop() * or sbdroprecord() when the data is acknowledged by the peer. */ #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; if (m != sb->sb_lastrecord) { printf("%s: sb_mb %p sb_lastrecord %p last %p\n", __func__, sb->sb_mb, sb->sb_lastrecord, m); printf("packet chain:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) printf("\t%p\n", m); panic("%s from %s:%u", __func__, file, line); } } void sblastmbufchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mbtail) { printf("%s: sb_mb %p sb_mbtail %p last %p\n", __func__, sb->sb_mb, sb->sb_mbtail, m); printf("packet tree:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { printf("\t"); for (n = m; n != NULL; n = n->m_next) printf("%p ", n); printf("\n"); } panic("%s from %s:%u", __func__, file, line); } } #endif /* SOCKBUF_DEBUG */ #define SBLINKRECORD(sb, m0) do { \ SOCKBUF_LOCK_ASSERT(sb); \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ (sb)->sb_mb = (m0); \ (sb)->sb_lastrecord = (m0); \ } while (/*CONSTCOND*/0) /* * Append mbuf chain m to the last record in the * socket buffer sb. The additional space associated * the mbuf chain is recorded in sb. Empty mbufs are * discarded and mbufs are compacted where possible. */ void sbappend_locked(sb, m) struct sockbuf *sb; struct mbuf *m; { register struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); if (m == 0) return; SBLASTRECORDCHK(sb); n = sb->sb_mb; if (n) { while (n->m_nextpkt) n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * XXX Would like to simply use sb_mbtail here, but * XXX I need to verify that I won't miss an EOR that * XXX way. */ if ((n = sb->sb_lastrecord) != NULL) { do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * If this is the first record in the socket buffer, * it's also the last record. */ sb->sb_lastrecord = m; } } sbcompress(sb, m, n); SBLASTRECORDCHK(sb); } /* * Append mbuf chain m to the last record in the * socket buffer sb. The additional space associated * the mbuf chain is recorded in sb. Empty mbufs are * discarded and mbufs are compacted where possible. */ void sbappend(sb, m) struct sockbuf *sb; struct mbuf *m; { SOCKBUF_LOCK(sb); sbappend_locked(sb, m); SOCKBUF_UNLOCK(sb); } /* * This version of sbappend() should only be used when the caller * absolutely knows that there will never be more than one record * in the socket buffer, that is, a stream protocol (such as TCP). */ void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); SBLASTMBUFCHK(sb); sbcompress(sb, m, sb->sb_mbtail); sb->sb_lastrecord = sb->sb_mb; SBLASTRECORDCHK(sb); } /* * This version of sbappend() should only be used when the caller * absolutely knows that there will never be more than one record * in the socket buffer, that is, a stream protocol (such as TCP). 
*/ void sbappendstream(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK(sb); sbappendstream_locked(sb, m); SOCKBUF_UNLOCK(sb); } #ifdef SOCKBUF_DEBUG void sbcheck(sb) struct sockbuf *sb; { struct mbuf *m; struct mbuf *n = 0; u_long len = 0, mbcnt = 0; SOCKBUF_LOCK_ASSERT(sb); for (m = sb->sb_mb; m; m = n) { n = m->m_nextpkt; for (; m; m = m->m_next) { len += m->m_len; mbcnt += MSIZE; if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; } } if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc, mbcnt, sb->sb_mbcnt); panic("sbcheck"); } } #endif /* * As above, except the mbuf chain * begins a new record. */ void sbappendrecord_locked(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { register struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); if (m0 == 0) return; m = sb->sb_mb; if (m) while (m->m_nextpkt) m = m->m_nextpkt; /* * Put the first mbuf on the queue. * Note this permits zero length records. */ sballoc(sb, m0); SBLASTRECORDCHK(sb); SBLINKRECORD(sb, m0); if (m) m->m_nextpkt = m0; else sb->sb_mb = m0; m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } sbcompress(sb, m, m0); } /* * As above, except the mbuf chain * begins a new record. */ void sbappendrecord(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { SOCKBUF_LOCK(sb); sbappendrecord_locked(sb, m0); SOCKBUF_UNLOCK(sb); } /* * As above except that OOB data * is inserted at the beginning of the sockbuf, * but after any other OOB data. */ void sbinsertoob_locked(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { register struct mbuf *m; register struct mbuf **mp; SOCKBUF_LOCK_ASSERT(sb); if (m0 == 0) return; for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { m = *mp; again: switch (m->m_type) { case MT_OOBDATA: continue; /* WANT next train */ case MT_CONTROL: m = m->m_next; if (m) goto again; /* inspect THIS train further */ } break; } /* * Put the first mbuf on the queue. * Note this permits zero length records. */ sballoc(sb, m0); m0->m_nextpkt = *mp; *mp = m0; m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } sbcompress(sb, m, m0); } /* * As above except that OOB data * is inserted at the beginning of the sockbuf, * but after any other OOB data. */ void sbinsertoob(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { SOCKBUF_LOCK(sb); sbinsertoob_locked(sb, m0); SOCKBUF_UNLOCK(sb); } /* * Append address and data, and optionally, control (ancillary) data * to the receive queue of a socket. If present, * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. 
*/ int sbappendaddr_locked(sb, asa, m0, control) struct sockbuf *sb; const struct sockaddr *asa; struct mbuf *m0, *control; { struct mbuf *m, *n, *nlast; int space = asa->sa_len; SOCKBUF_LOCK_ASSERT(sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr_locked"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &n); if (space > sbspace(sb)) return (0); #if MSIZE <= 256 if (asa->sa_len > MLEN) return (0); #endif MGET(m, M_DONTWAIT, MT_SONAME); if (m == 0) return (0); m->m_len = asa->sa_len; bcopy(asa, mtod(m, caddr_t), asa->sa_len); if (n) n->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } /* * Append address and data, and optionally, control (ancillary) data * to the receive queue of a socket. If present, * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. */ int sbappendaddr(sb, asa, m0, control) struct sockbuf *sb; const struct sockaddr *asa; struct mbuf *m0, *control; { int retval; SOCKBUF_LOCK(sb); retval = sbappendaddr_locked(sb, asa, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } int sbappendcontrol_locked(sb, m0, control) struct sockbuf *sb; struct mbuf *control, *m0; { struct mbuf *m, *n, *mlast; int space; SOCKBUF_LOCK_ASSERT(sb); if (control == 0) panic("sbappendcontrol_locked"); space = m_length(control, &n) + m_length(m0, NULL); if (space > sbspace(sb)) return (0); n->m_next = m0; /* concatenate data to control */ SBLASTRECORDCHK(sb); for (m = control; m->m_next; m = m->m_next) sballoc(sb, m); sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } int sbappendcontrol(sb, m0, control) struct sockbuf *sb; struct mbuf *control, *m0; { int retval; SOCKBUF_LOCK(sb); retval = sbappendcontrol_locked(sb, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } /* * Compress mbuf chain m into the socket * buffer sb following mbuf n. If n * is null, the buffer is presumed empty. */ void sbcompress(sb, m, n) register struct sockbuf *sb; register struct mbuf *m, *n; { register int eor = 0; register struct mbuf *o; SOCKBUF_LOCK_ASSERT(sb); while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) { if (sb->sb_lastrecord == m) sb->sb_lastrecord = m->m_next; m = m_free(m); continue; } if (n && (n->m_flags & M_EOR) == 0 && M_WRITABLE(n) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, (unsigned)m->m_len); n->m_len += m->m_len; sb->sb_cc += m->m_len; if (m->m_type != MT_DATA && m->m_type != MT_HEADER && m->m_type != MT_OOBDATA) /* XXX: Probably don't need.*/ sb->sb_ctl += m->m_len; m = m_free(m); continue; } if (n) n->m_next = m; else sb->sb_mb = m; sb->sb_mbtail = m; sballoc(sb, m); n = m; m->m_flags &= ~M_EOR; m = m->m_next; n->m_next = 0; } if (eor) { if (n) n->m_flags |= eor; else printf("semi-panic: sbcompress\n"); } SBLASTMBUFCHK(sb); } /* * Free all mbufs in a sockbuf. * Check that all resources are reclaimed. 
*/ void sbflush_locked(sb) register struct sockbuf *sb; { SOCKBUF_LOCK_ASSERT(sb); if (sb->sb_flags & SB_LOCK) panic("sbflush_locked: locked"); while (sb->sb_mbcnt) { /* * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: * we would loop forever. Panic instead. */ if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) break; sbdrop_locked(sb, (int)sb->sb_cc); } if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) panic("sbflush_locked: cc %u || mb %p || mbcnt %u", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); } void sbflush(sb) register struct sockbuf *sb; { SOCKBUF_LOCK(sb); sbflush_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Drop data from (the front of) a sockbuf. */ void sbdrop_locked(sb, len) register struct sockbuf *sb; register int len; { register struct mbuf *m; struct mbuf *next; SOCKBUF_LOCK_ASSERT(sb); next = (m = sb->sb_mb) ? m->m_nextpkt : 0; while (len > 0) { if (m == 0) { if (next == 0) panic("sbdrop"); m = next; next = m->m_nextpkt; continue; } if (m->m_len > len) { m->m_len -= len; m->m_data += len; sb->sb_cc -= len; if (m->m_type != MT_DATA && m->m_type != MT_HEADER && m->m_type != MT_OOBDATA) sb->sb_ctl -= len; break; } len -= m->m_len; sbfree(sb, m); m = m_free(m); } while (m && m->m_len == 0) { sbfree(sb, m); m = m_free(m); } if (m) { sb->sb_mb = m; m->m_nextpkt = next; } else sb->sb_mb = next; /* * First part is an inline SB_EMPTY_FIXUP(). Second part * makes sure sb_lastrecord is up-to-date if we dropped * part of the last record. */ m = sb->sb_mb; if (m == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (m->m_nextpkt == NULL) { sb->sb_lastrecord = m; } } /* * Drop data from (the front of) a sockbuf. */ void sbdrop(sb, len) register struct sockbuf *sb; register int len; { SOCKBUF_LOCK(sb); sbdrop_locked(sb, len); SOCKBUF_UNLOCK(sb); } /* * Drop a record off the front of a sockbuf * and move the next record to the front. */ void sbdroprecord_locked(sb) register struct sockbuf *sb; { register struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; do { sbfree(sb, m); m = m_free(m); } while (m); } SB_EMPTY_FIXUP(sb); } /* * Drop a record off the front of a sockbuf * and move the next record to the front. */ void sbdroprecord(sb) register struct sockbuf *sb; { SOCKBUF_LOCK(sb); sbdroprecord_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Create a "control" mbuf containing the specified data * with the specified type for presentation on a socket buffer. */ struct mbuf * sbcreatecontrol(p, size, type, level) caddr_t p; register int size; int type, level; { register struct cmsghdr *cp; struct mbuf *m; if (CMSG_SPACE((u_int)size) > MCLBYTES) return ((struct mbuf *) NULL); if (CMSG_SPACE((u_int)size > MLEN)) m = m_getcl(M_DONTWAIT, MT_CONTROL, 0); else m = m_get(M_DONTWAIT, MT_CONTROL); if (m == NULL) return ((struct mbuf *) NULL); cp = mtod(m, struct cmsghdr *); m->m_len = 0; KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), ("sbcreatecontrol: short mbuf")); if (p != NULL) (void)memcpy(CMSG_DATA(cp), p, size); m->m_len = CMSG_SPACE(size); cp->cmsg_len = CMSG_LEN(size); cp->cmsg_level = level; cp->cmsg_type = type; return (m); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. 
*/ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, struct thread *td) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } /* * This isn't really a ``null'' operation, but it's the default one * and doesn't do anything destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } /* * For protocol types that don't keep cached copies of labels in their * pcbs, provide a null sosetlabel that does a NOOP. */ void pru_sosetlabel_null(struct socket *so) { } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Create an external-format (``xsocket'') structure using the information * in the kernel-format socket structure pointed to by so. This is done * to reduce the spew of irrelevant information over this interface, * to isolate user code from changes in the kernel structure, and * potentially to provide information-hiding if we decide that * some of this information should be hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { xso->xso_len = sizeof *xso; xso->xso_so = so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_qlen = so->so_qlen; xso->so_incqlen = so->so_incqlen; xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_uid = so->so_cred->cr_uid; } /* * This does the same for sockbufs. Note that the xsockbuf structure, * since it is always embedded in a socket, does not include a self * pointer nor a length. We make this entry point public in case * some other mechanism needs it. */ void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) { xsb->sb_cc = sb->sb_cc; xsb->sb_hiwat = sb->sb_hiwat; xsb->sb_mbcnt = sb->sb_mbcnt; xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; xsb->sb_timeo = sb->sb_timeo; } /* * Here is the definition of some of the basic objects in the kern.ipc * branch of the MIB. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. 
*/ static int dummy; SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW, &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size"); SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RDTUN, &maxsockets, 0, "Maximum number of sockets avaliable"); SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, &sb_efficiency, 0, ""); /* * Initialise maxsockets */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); Index: head/sys/kern/uipc_socket2.c =================================================================== --- head/sys/kern/uipc_socket2.c (revision 131150) +++ head/sys/kern/uipc_socket2.c (revision 131151) @@ -1,1432 +1,1429 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_param.h" #include #include /* for aio_swake proto */ #include #include #include /* for maxfiles */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int maxsockets; void (*aio_swake)(struct socket *, struct sockbuf *); /* * Primitive routines for operating on sockets and socket buffers */ u_long sb_max = SB_MAX; static u_long sb_max_adj = SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */ static u_long sb_efficiency = 8; /* parameter for sbreserve() */ /* * Procedures to manipulate state flags of socket * and do appropriate wakeups. Normal sequence from the * active (originating) side is that soisconnecting() is * called during processing of connect() call, * resulting in an eventual call to soisconnected() if/when the * connection is established. 
When the connection is torn down * soisdisconnecting() is called during processing of disconnect() call, * and soisdisconnected() is called when the connection to the peer * is totally severed. The semantics of these routines are such that * connectionless protocols can call soisconnected() and soisdisconnected() * only, bypassing the in-progress calls when setting up a ``connection'' * takes no time. * * From the passive side, a socket is created with * two queues of sockets: so_incomp for connections in progress * and so_comp for connections already made and awaiting user acceptance. * As a protocol is preparing incoming connections, it creates a socket * structure queued on so_incomp by calling sonewconn(). When the connection * is established, soisconnected() is called, and transfers the * socket structure to so_comp, making it available to accept(). * * If a socket is closed with sockets on either * so_incomp or so_comp, these sockets are dropped. * * If higher level protocols are implemented in * the kernel, the wakeups done here will sometimes * cause software-interrupt process scheduling. */ void soisconnecting(so) register struct socket *so; { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(so) struct socket *so; { struct socket *head; SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; SOCK_UNLOCK(so); ACCEPT_LOCK(); head = so->so_head; if (head != NULL && (so->so_qstate & SQ_INCOMP)) { if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->so_incomp, so, so_list); head->so_incqlen--; so->so_qstate &= ~SQ_INCOMP; TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); head->so_qlen++; so->so_qstate |= SQ_COMP; ACCEPT_UNLOCK(); sorwakeup(head); wakeup_one(&head->so_timeo); } else { ACCEPT_UNLOCK(); SOCK_LOCK(so); so->so_upcall = head->so_accf->so_accept_filter->accf_callback; so->so_upcallarg = head->so_accf->so_accept_filter_arg; so->so_rcv.sb_flags |= SB_UPCALL; so->so_options &= ~SO_ACCEPTFILTER; SOCK_UNLOCK(so); so->so_upcall(so, so->so_upcallarg, M_TRYWAIT); } return; } ACCEPT_UNLOCK(); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(so) register struct socket *so; { /* * XXXRW: This code separately acquires SOCK_LOCK(so) and * SOCKBUF_LOCK(&so->so_rcv) even though they are the same mutex to * avoid introducing the assumption that they are the same. */ SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_state |= SBS_CANTRCVMORE; - SOCKBUF_UNLOCK(&so->so_rcv); + sorwakeup_locked(so); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; - SOCKBUF_UNLOCK(&so->so_snd); + sowwakeup_locked(so); wakeup(&so->so_timeo); - sowwakeup(so); - sorwakeup(so); } void soisdisconnected(so) register struct socket *so; { /* * XXXRW: This code separately acquires SOCK_LOCK(so) and * SOCKBUF_LOCK(&so->so_rcv) even though they are the same mutex to * avoid introducing the assumption that they are the same. */ + /* XXXRW: so_state locking? 
*/ SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_state |= SBS_CANTRCVMORE; - SOCKBUF_UNLOCK(&so->so_rcv); + sorwakeup_locked(so); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; sbdrop_locked(&so->so_snd, so->so_snd.sb_cc); - SOCKBUF_UNLOCK(&so->so_snd); + sowwakeup_locked(so); wakeup(&so->so_timeo); - sowwakeup(so); - sorwakeup(so); } /* * When an attempt at a new connection is noted on a socket * which accepts connections, sonewconn is called. If the * connection is possible (subject to space constraints, etc.) * then we allocate a new structure, propoerly linked into the * data structure of the original socket, and return this. * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. * * note: the ref count on the socket is 0 on return */ struct socket * sonewconn(head, connstatus) register struct socket *head; int connstatus; { register struct socket *so; int over; ACCEPT_LOCK(); over = (head->so_qlen > 3 * head->so_qlimit / 2); ACCEPT_UNLOCK(); if (over) return ((struct socket *)0); so = soalloc(M_NOWAIT); if (so == NULL) return ((struct socket *)0); if ((head->so_options & SO_ACCEPTFILTER) != 0) connstatus = 0; so->so_head = head; so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_proto = head->so_proto; so->so_timeo = head->so_timeo; so->so_cred = crhold(head->so_cred); #ifdef MAC SOCK_LOCK(head); mac_create_socket_from_socket(head, so); SOCK_UNLOCK(head); #endif if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); return ((struct socket *)0); } ACCEPT_LOCK(); if (connstatus) { TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); so->so_qstate |= SQ_COMP; head->so_qlen++; } else { /* * XXXRW: Keep removing sockets from the head until there's * room for us to insert on the tail. In pre-locking * revisions, this was a simple if(), but as we could be * racing with other threads and soabort() requires dropping * locks, we must loop waiting for the condition to be true. */ while (head->so_incqlen > head->so_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->so_incomp); TAILQ_REMOVE(&so->so_incomp, sp, so_list); head->so_incqlen--; sp->so_qstate &= ~SQ_INCOMP; sp->so_head = NULL; ACCEPT_UNLOCK(); (void) soabort(sp); ACCEPT_LOCK(); } TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); so->so_qstate |= SQ_INCOMP; head->so_incqlen++; } ACCEPT_UNLOCK(); if (connstatus) { so->so_state |= connstatus; sorwakeup(head); wakeup_one(&head->so_timeo); } return (so); } /* * Socantsendmore indicates that no more data will be sent on the * socket; it would normally be applied to a socket when the user * informs the system that no more data is to be sent, by the protocol * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data * will be received, and will normally be applied to the socket by a * protocol when it detects that the peer will send no more data. * Data queued for reading in the socket may yet be read. 
*/ void socantsendmore_locked(so) struct socket *so; { SOCKBUF_LOCK_ASSERT(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; sowwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantsendmore(so) struct socket *so; { SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantrcvmore_locked(so) struct socket *so; { SOCKBUF_LOCK_ASSERT(&so->so_rcv); so->so_rcv.sb_state |= SBS_CANTRCVMORE; sorwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } void socantrcvmore(so) struct socket *so; { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } /* * Wait for data to arrive at/drain from a socket buffer. */ int sbwait(sb) struct sockbuf *sb; { SOCKBUF_LOCK_ASSERT(sb); sb->sb_flags |= SB_WAIT; return (msleep(&sb->sb_cc, &sb->sb_mtx, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", sb->sb_timeo)); } /* * Lock a sockbuf already known to be locked; * return any error returned from sleep (EINTR). */ int sb_lock(sb) register struct sockbuf *sb; { int error; SOCKBUF_LOCK_ASSERT(sb); while (sb->sb_flags & SB_LOCK) { sb->sb_flags |= SB_WANT; error = msleep(&sb->sb_flags, &sb->sb_mtx, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, "sblock", 0); if (error) return (error); } sb->sb_flags |= SB_LOCK; return (0); } /* * Wakeup processes waiting on a socket buffer. Do asynchronous * notification via SIGIO if the socket has the SS_ASYNC flag set. * * Called with the socket buffer lock held; will release the lock by the end * of the function. This allows the caller to acquire the socket buffer lock * while testing for the need for various sorts of wakeup and hold it through * to the point where it's no longer required. We currently hold the lock * through calls out to other subsystems (with the exception of kqueue), and * then release it to avoid lock order issues. It's not clear that's * correct. */ void sowakeup(so, sb) register struct socket *so; register struct sockbuf *sb; { SOCKBUF_LOCK_ASSERT(sb); selwakeuppri(&sb->sb_sel, PSOCK); sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_cc); } KNOTE(&sb->sb_sel.si_note, 0); SOCKBUF_UNLOCK(sb); if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); if (sb->sb_flags & SB_UPCALL) (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); if (sb->sb_flags & SB_AIO) aio_swake(so, sb); mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED); } /* * Socket buffer (struct sockbuf) utility routines. * * Each socket contains two socket buffers: one for sending data and * one for receiving data. Each buffer contains a queue of mbufs, * information about the number of mbufs and amount of data in the * queue, and other fields allowing select() statements and notification * on data availability to be implemented. * * Data stored in a socket buffer is maintained as a list of records. * Each record is a list of mbufs chained together with the m_next * field. Records are chained together with the m_nextpkt field. The upper * level routine soreceive() expects the following conventions to be * observed when placing information in the receive buffer: * * 1. If the protocol requires each message be preceded by the sender's * name, then a record containing that name must be present before * any associated data (mbuf's must be of type MT_SONAME). * 2. 
If the protocol supports the exchange of ``access rights'' (really * just additional data associated with the message), and there are * ``rights'' to be received, then a record containing this data * should be present (mbuf's must be of type MT_RIGHTS). * 3. If a name or rights record exists, then it must be followed by * a data record, perhaps of zero length. * * Before using a new socket structure it is first necessary to reserve * buffer space to the socket, by calling sbreserve(). This should commit * some of the available buffer space in the system buffer pool for the * socket (currently, it does nothing but enforce limits). The space * should be released by calling sbrelease() when the socket is destroyed. */ int soreserve(so, sndcc, rcvcc) register struct socket *so; u_long sndcc, rcvcc; { struct thread *td = curthread; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0) goto bad; if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0) goto bad2; if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (0); bad2: sbrelease_locked(&so->so_snd, so); bad: SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (ENOBUFS); } static int sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS) { int error = 0; u_long old_sb_max = sb_max; error = SYSCTL_OUT(req, arg1, sizeof(u_long)); if (error || !req->newptr) return (error); error = SYSCTL_IN(req, arg1, sizeof(u_long)); if (error) return (error); if (sb_max < MSIZE + MCLBYTES) { sb_max = old_sb_max; return (EINVAL); } sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES); return (0); } /* * Allot mbufs to a sockbuf. * Attempt to scale mbmax so that mbcnt doesn't become limiting * if buffering efficiency is near the normal case. */ int sbreserve_locked(sb, cc, so, td) struct sockbuf *sb; u_long cc; struct socket *so; struct thread *td; { rlim_t sbsize_limit; SOCKBUF_LOCK_ASSERT(sb); /* * td will only be NULL when we're in an interrupt * (e.g. in tcp_input()) */ if (cc > sb_max_adj) return (0); if (td != NULL) { PROC_LOCK(td->td_proc); sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE); PROC_UNLOCK(td->td_proc); } else sbsize_limit = RLIM_INFINITY; if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, sbsize_limit)) return (0); sb->sb_mbmax = min(cc * sb_efficiency, sb_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (1); } int sbreserve(sb, cc, so, td) struct sockbuf *sb; u_long cc; struct socket *so; struct thread *td; { int error; SOCKBUF_LOCK(sb); error = sbreserve_locked(sb, cc, so, td); SOCKBUF_UNLOCK(sb); return (error); } /* * Free mbufs held by a socket, and reserved mbuf space. */ void sbrelease_locked(sb, so) struct sockbuf *sb; struct socket *so; { SOCKBUF_LOCK_ASSERT(sb); sbflush_locked(sb); (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); sb->sb_mbmax = 0; } void sbrelease(sb, so) struct sockbuf *sb; struct socket *so; { SOCKBUF_LOCK(sb); sbrelease_locked(sb, so); SOCKBUF_UNLOCK(sb); } /* * Routines to add and remove * data from an mbuf queue. * * The routines sbappend() or sbappendrecord() are normally called to * append new mbufs to a socket buffer, after checking that adequate * space is available, comparing the function sbspace() with the amount * of data to be added. 
sbappendrecord() differs from sbappend() in * that data supplied is treated as the beginning of a new record. * To place a sender's address, optional access rights, and data in a * socket receive buffer, sbappendaddr() should be used. To place * access rights and data in a socket receive buffer, sbappendrights() * should be used. In either case, the new data begins a new record. * Note that unlike sbappend() and sbappendrecord(), these routines check * for the caller that there will be enough space to store the data. * Each fails if there is not enough space, or if it cannot find mbufs * to store additional information in. * * Reliable protocols may use the socket send buffer to hold data * awaiting acknowledgement. Data is normally copied from a socket * send buffer in a protocol with m_copy for output to a peer, * and then removing the data from the socket buffer with sbdrop() * or sbdroprecord() when the data is acknowledged by the peer. */ #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; if (m != sb->sb_lastrecord) { printf("%s: sb_mb %p sb_lastrecord %p last %p\n", __func__, sb->sb_mb, sb->sb_lastrecord, m); printf("packet chain:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) printf("\t%p\n", m); panic("%s from %s:%u", __func__, file, line); } } void sblastmbufchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mbtail) { printf("%s: sb_mb %p sb_mbtail %p last %p\n", __func__, sb->sb_mb, sb->sb_mbtail, m); printf("packet tree:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { printf("\t"); for (n = m; n != NULL; n = n->m_next) printf("%p ", n); printf("\n"); } panic("%s from %s:%u", __func__, file, line); } } #endif /* SOCKBUF_DEBUG */ #define SBLINKRECORD(sb, m0) do { \ SOCKBUF_LOCK_ASSERT(sb); \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ (sb)->sb_mb = (m0); \ (sb)->sb_lastrecord = (m0); \ } while (/*CONSTCOND*/0) /* * Append mbuf chain m to the last record in the * socket buffer sb. The additional space associated * the mbuf chain is recorded in sb. Empty mbufs are * discarded and mbufs are compacted where possible. */ void sbappend_locked(sb, m) struct sockbuf *sb; struct mbuf *m; { register struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); if (m == 0) return; SBLASTRECORDCHK(sb); n = sb->sb_mb; if (n) { while (n->m_nextpkt) n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * XXX Would like to simply use sb_mbtail here, but * XXX I need to verify that I won't miss an EOR that * XXX way. */ if ((n = sb->sb_lastrecord) != NULL) { do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * If this is the first record in the socket buffer, * it's also the last record. */ sb->sb_lastrecord = m; } } sbcompress(sb, m, n); SBLASTRECORDCHK(sb); } /* * Append mbuf chain m to the last record in the * socket buffer sb. The additional space associated * the mbuf chain is recorded in sb. Empty mbufs are * discarded and mbufs are compacted where possible. 
*/ void sbappend(sb, m) struct sockbuf *sb; struct mbuf *m; { SOCKBUF_LOCK(sb); sbappend_locked(sb, m); SOCKBUF_UNLOCK(sb); } /* * This version of sbappend() should only be used when the caller * absolutely knows that there will never be more than one record * in the socket buffer, that is, a stream protocol (such as TCP). */ void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); SBLASTMBUFCHK(sb); sbcompress(sb, m, sb->sb_mbtail); sb->sb_lastrecord = sb->sb_mb; SBLASTRECORDCHK(sb); } /* * This version of sbappend() should only be used when the caller * absolutely knows that there will never be more than one record * in the socket buffer, that is, a stream protocol (such as TCP). */ void sbappendstream(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK(sb); sbappendstream_locked(sb, m); SOCKBUF_UNLOCK(sb); } #ifdef SOCKBUF_DEBUG void sbcheck(sb) struct sockbuf *sb; { struct mbuf *m; struct mbuf *n = 0; u_long len = 0, mbcnt = 0; SOCKBUF_LOCK_ASSERT(sb); for (m = sb->sb_mb; m; m = n) { n = m->m_nextpkt; for (; m; m = m->m_next) { len += m->m_len; mbcnt += MSIZE; if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; } } if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc, mbcnt, sb->sb_mbcnt); panic("sbcheck"); } } #endif /* * As above, except the mbuf chain * begins a new record. */ void sbappendrecord_locked(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { register struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); if (m0 == 0) return; m = sb->sb_mb; if (m) while (m->m_nextpkt) m = m->m_nextpkt; /* * Put the first mbuf on the queue. * Note this permits zero length records. */ sballoc(sb, m0); SBLASTRECORDCHK(sb); SBLINKRECORD(sb, m0); if (m) m->m_nextpkt = m0; else sb->sb_mb = m0; m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } sbcompress(sb, m, m0); } /* * As above, except the mbuf chain * begins a new record. */ void sbappendrecord(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { SOCKBUF_LOCK(sb); sbappendrecord_locked(sb, m0); SOCKBUF_UNLOCK(sb); } /* * As above except that OOB data * is inserted at the beginning of the sockbuf, * but after any other OOB data. */ void sbinsertoob_locked(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { register struct mbuf *m; register struct mbuf **mp; SOCKBUF_LOCK_ASSERT(sb); if (m0 == 0) return; for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { m = *mp; again: switch (m->m_type) { case MT_OOBDATA: continue; /* WANT next train */ case MT_CONTROL: m = m->m_next; if (m) goto again; /* inspect THIS train further */ } break; } /* * Put the first mbuf on the queue. * Note this permits zero length records. */ sballoc(sb, m0); m0->m_nextpkt = *mp; *mp = m0; m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } sbcompress(sb, m, m0); } /* * As above except that OOB data * is inserted at the beginning of the sockbuf, * but after any other OOB data. */ void sbinsertoob(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { SOCKBUF_LOCK(sb); sbinsertoob_locked(sb, m0); SOCKBUF_UNLOCK(sb); } /* * Append address and data, and optionally, control (ancillary) data * to the receive queue of a socket. 
If present, * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. */ int sbappendaddr_locked(sb, asa, m0, control) struct sockbuf *sb; const struct sockaddr *asa; struct mbuf *m0, *control; { struct mbuf *m, *n, *nlast; int space = asa->sa_len; SOCKBUF_LOCK_ASSERT(sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr_locked"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &n); if (space > sbspace(sb)) return (0); #if MSIZE <= 256 if (asa->sa_len > MLEN) return (0); #endif MGET(m, M_DONTWAIT, MT_SONAME); if (m == 0) return (0); m->m_len = asa->sa_len; bcopy(asa, mtod(m, caddr_t), asa->sa_len); if (n) n->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } /* * Append address and data, and optionally, control (ancillary) data * to the receive queue of a socket. If present, * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. */ int sbappendaddr(sb, asa, m0, control) struct sockbuf *sb; const struct sockaddr *asa; struct mbuf *m0, *control; { int retval; SOCKBUF_LOCK(sb); retval = sbappendaddr_locked(sb, asa, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } int sbappendcontrol_locked(sb, m0, control) struct sockbuf *sb; struct mbuf *control, *m0; { struct mbuf *m, *n, *mlast; int space; SOCKBUF_LOCK_ASSERT(sb); if (control == 0) panic("sbappendcontrol_locked"); space = m_length(control, &n) + m_length(m0, NULL); if (space > sbspace(sb)) return (0); n->m_next = m0; /* concatenate data to control */ SBLASTRECORDCHK(sb); for (m = control; m->m_next; m = m->m_next) sballoc(sb, m); sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } int sbappendcontrol(sb, m0, control) struct sockbuf *sb; struct mbuf *control, *m0; { int retval; SOCKBUF_LOCK(sb); retval = sbappendcontrol_locked(sb, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } /* * Compress mbuf chain m into the socket * buffer sb following mbuf n. If n * is null, the buffer is presumed empty. */ void sbcompress(sb, m, n) register struct sockbuf *sb; register struct mbuf *m, *n; { register int eor = 0; register struct mbuf *o; SOCKBUF_LOCK_ASSERT(sb); while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) { if (sb->sb_lastrecord == m) sb->sb_lastrecord = m->m_next; m = m_free(m); continue; } if (n && (n->m_flags & M_EOR) == 0 && M_WRITABLE(n) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, (unsigned)m->m_len); n->m_len += m->m_len; sb->sb_cc += m->m_len; if (m->m_type != MT_DATA && m->m_type != MT_HEADER && m->m_type != MT_OOBDATA) /* XXX: Probably don't need.*/ sb->sb_ctl += m->m_len; m = m_free(m); continue; } if (n) n->m_next = m; else sb->sb_mb = m; sb->sb_mbtail = m; sballoc(sb, m); n = m; m->m_flags &= ~M_EOR; m = m->m_next; n->m_next = 0; } if (eor) { if (n) n->m_flags |= eor; else printf("semi-panic: sbcompress\n"); } SBLASTMBUFCHK(sb); } /* * Free all mbufs in a sockbuf. * Check that all resources are reclaimed. 
*/ void sbflush_locked(sb) register struct sockbuf *sb; { SOCKBUF_LOCK_ASSERT(sb); if (sb->sb_flags & SB_LOCK) panic("sbflush_locked: locked"); while (sb->sb_mbcnt) { /* * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: * we would loop forever. Panic instead. */ if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) break; sbdrop_locked(sb, (int)sb->sb_cc); } if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) panic("sbflush_locked: cc %u || mb %p || mbcnt %u", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); } void sbflush(sb) register struct sockbuf *sb; { SOCKBUF_LOCK(sb); sbflush_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Drop data from (the front of) a sockbuf. */ void sbdrop_locked(sb, len) register struct sockbuf *sb; register int len; { register struct mbuf *m; struct mbuf *next; SOCKBUF_LOCK_ASSERT(sb); next = (m = sb->sb_mb) ? m->m_nextpkt : 0; while (len > 0) { if (m == 0) { if (next == 0) panic("sbdrop"); m = next; next = m->m_nextpkt; continue; } if (m->m_len > len) { m->m_len -= len; m->m_data += len; sb->sb_cc -= len; if (m->m_type != MT_DATA && m->m_type != MT_HEADER && m->m_type != MT_OOBDATA) sb->sb_ctl -= len; break; } len -= m->m_len; sbfree(sb, m); m = m_free(m); } while (m && m->m_len == 0) { sbfree(sb, m); m = m_free(m); } if (m) { sb->sb_mb = m; m->m_nextpkt = next; } else sb->sb_mb = next; /* * First part is an inline SB_EMPTY_FIXUP(). Second part * makes sure sb_lastrecord is up-to-date if we dropped * part of the last record. */ m = sb->sb_mb; if (m == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (m->m_nextpkt == NULL) { sb->sb_lastrecord = m; } } /* * Drop data from (the front of) a sockbuf. */ void sbdrop(sb, len) register struct sockbuf *sb; register int len; { SOCKBUF_LOCK(sb); sbdrop_locked(sb, len); SOCKBUF_UNLOCK(sb); } /* * Drop a record off the front of a sockbuf * and move the next record to the front. */ void sbdroprecord_locked(sb) register struct sockbuf *sb; { register struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; do { sbfree(sb, m); m = m_free(m); } while (m); } SB_EMPTY_FIXUP(sb); } /* * Drop a record off the front of a sockbuf * and move the next record to the front. */ void sbdroprecord(sb) register struct sockbuf *sb; { SOCKBUF_LOCK(sb); sbdroprecord_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Create a "control" mbuf containing the specified data * with the specified type for presentation on a socket buffer. */ struct mbuf * sbcreatecontrol(p, size, type, level) caddr_t p; register int size; int type, level; { register struct cmsghdr *cp; struct mbuf *m; if (CMSG_SPACE((u_int)size) > MCLBYTES) return ((struct mbuf *) NULL); if (CMSG_SPACE((u_int)size > MLEN)) m = m_getcl(M_DONTWAIT, MT_CONTROL, 0); else m = m_get(M_DONTWAIT, MT_CONTROL); if (m == NULL) return ((struct mbuf *) NULL); cp = mtod(m, struct cmsghdr *); m->m_len = 0; KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), ("sbcreatecontrol: short mbuf")); if (p != NULL) (void)memcpy(CMSG_DATA(cp), p, size); m->m_len = CMSG_SPACE(size); cp->cmsg_len = CMSG_LEN(size); cp->cmsg_level = level; cp->cmsg_type = type; return (m); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. 
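 *
 * Editor's illustration, not part of the original source: a protocol that
 * has no use for, say, connect(2) or out-of-band data can plug these
 * stubs straight into its pr_usrreqs table, in the same positional style
 * as the uipc_usrreqs and div_usrreqs initializers elsewhere in this
 * change.  The foo_* names are invented placeholders for the handlers a
 * real protocol would supply:
 *
 *	struct pr_usrreqs foo_usrreqs = {
 *		foo_abort, pru_accept_notsupp, foo_attach, foo_bind,
 *		pru_connect_notsupp, pru_connect2_notsupp,
 *		pru_control_notsupp, foo_detach, foo_disconnect,
 *		pru_listen_notsupp, foo_peeraddr, pru_rcvd_notsupp,
 *		pru_rcvoob_notsupp, foo_send, pru_sense_null, foo_shutdown,
 *		foo_sockaddr, sosend, soreceive, sopoll, pru_sosetlabel_null
 *	};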
*/ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, struct thread *td) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } /* * This isn't really a ``null'' operation, but it's the default one * and doesn't do anything destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } /* * For protocol types that don't keep cached copies of labels in their * pcbs, provide a null sosetlabel that does a NOOP. */ void pru_sosetlabel_null(struct socket *so) { } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Create an external-format (``xsocket'') structure using the information * in the kernel-format socket structure pointed to by so. This is done * to reduce the spew of irrelevant information over this interface, * to isolate user code from changes in the kernel structure, and * potentially to provide information-hiding if we decide that * some of this information should be hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { xso->xso_len = sizeof *xso; xso->xso_so = so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_qlen = so->so_qlen; xso->so_incqlen = so->so_incqlen; xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_uid = so->so_cred->cr_uid; } /* * This does the same for sockbufs. Note that the xsockbuf structure, * since it is always embedded in a socket, does not include a self * pointer nor a length. We make this entry point public in case * some other mechanism needs it. */ void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) { xsb->sb_cc = sb->sb_cc; xsb->sb_hiwat = sb->sb_hiwat; xsb->sb_mbcnt = sb->sb_mbcnt; xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; xsb->sb_timeo = sb->sb_timeo; } /* * Here is the definition of some of the basic objects in the kern.ipc * branch of the MIB. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. 
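 *
 * Editor's illustration, not part of the original source: the OID defined
 * below is visible to userland as kern.ipc.maxsockbuf, either through
 * sysctl(8) or programmatically (minimal sketch, error handling trimmed):
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		u_long max;
 *		size_t len = sizeof(max);
 *
 *		if (sysctlbyname("kern.ipc.maxsockbuf", &max, &len,
 *		    NULL, 0) == -1)
 *			return (1);
 *		printf("kern.ipc.maxsockbuf = %lu\n", max);
 *		return (0);
 *	}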
*/ static int dummy; SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW, &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size"); SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RDTUN, &maxsockets, 0, "Maximum number of sockets avaliable"); SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, &sb_efficiency, 0, ""); /* * Initialise maxsockets */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); Index: head/sys/kern/uipc_usrreq.c =================================================================== --- head/sys/kern/uipc_usrreq.c (revision 131150) +++ head/sys/kern/uipc_usrreq.c (revision 131151) @@ -1,1682 +1,1679 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include /* XXX must be before */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t unp_zone; static unp_gen_t unp_gencnt; static u_int unp_count; static struct unp_head unp_shead, unp_dhead; /* * Unix communications domain. 
* * TODO: * SEQPACKET, RDM * rethink name space problems * need a proper out-of-band * lock pushdown */ static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; static ino_t unp_ino; /* prototype for fake inode numbers */ static struct mtx unp_mtx; #define UNP_LOCK_INIT() \ mtx_init(&unp_mtx, "unp", NULL, MTX_DEF) #define UNP_LOCK() mtx_lock(&unp_mtx) #define UNP_UNLOCK() mtx_unlock(&unp_mtx) #define UNP_LOCK_ASSERT() mtx_assert(&unp_mtx, MA_OWNED) static int unp_attach(struct socket *); static void unp_detach(struct unpcb *); static int unp_bind(struct unpcb *,struct sockaddr *, struct thread *); static int unp_connect(struct socket *,struct sockaddr *, struct thread *); static int unp_connect2(struct socket *so, struct socket *so2); static void unp_disconnect(struct unpcb *); static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *, int); static void unp_gc(void); static void unp_scan(struct mbuf *, void (*)(struct file *)); static void unp_mark(struct file *); static void unp_discard(struct file *); static void unp_freerights(struct file **, int); static int unp_internalize(struct mbuf **, struct thread *); static int unp_listen(struct unpcb *, struct thread *); static int uipc_abort(struct socket *so) { struct unpcb *unp = sotounpcb(so); if (unp == NULL) return (EINVAL); UNP_LOCK(); unp_drop(unp, ECONNABORTED); unp_detach(unp); /* NB: unlocks */ SOCK_LOCK(so); sotryfree(so); return (0); } static int uipc_accept(struct socket *so, struct sockaddr **nam) { struct unpcb *unp = sotounpcb(so); const struct sockaddr *sa; if (unp == NULL) return (EINVAL); /* * Pass back name of connected socket, * if it was bound and we are still connected * (our peer may have closed already!). */ *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_LOCK(); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) sa = (struct sockaddr *) unp->unp_conn->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_UNLOCK(); return (0); } static int uipc_attach(struct socket *so, int proto, struct thread *td) { struct unpcb *unp = sotounpcb(so); if (unp != NULL) return (EISCONN); return (unp_attach(so)); } static int uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct unpcb *unp = sotounpcb(so); if (unp == NULL) return (EINVAL); return (unp_bind(unp, nam, td)); } static int uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct unpcb *unp = sotounpcb(so); int error; if (unp == NULL) return (EINVAL); UNP_LOCK(); error = unp_connect(so, nam, curthread); UNP_UNLOCK(); return (error); } int uipc_connect2(struct socket *so1, struct socket *so2) { struct unpcb *unp = sotounpcb(so1); int error; if (unp == NULL) return (EINVAL); UNP_LOCK(); error = unp_connect2(so1, so2); UNP_UNLOCK(); return (error); } /* control is EOPNOTSUPP */ static int uipc_detach(struct socket *so) { struct unpcb *unp = sotounpcb(so); if (unp == NULL) return (EINVAL); UNP_LOCK(); unp_detach(unp); /* NB: unlocks unp */ return (0); } static int uipc_disconnect(struct socket *so) { struct unpcb *unp = sotounpcb(so); if (unp == NULL) return (EINVAL); UNP_LOCK(); unp_disconnect(unp); UNP_UNLOCK(); return (0); } static int uipc_listen(struct socket *so, struct thread *td) { struct unpcb *unp = sotounpcb(so); int error; if (unp == NULL || unp->unp_vnode == NULL) return (EINVAL); UNP_LOCK(); error = unp_listen(unp, td); UNP_UNLOCK(); return (error); } static int uipc_peeraddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp = 
sotounpcb(so); const struct sockaddr *sa; if (unp == NULL) return (EINVAL); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_LOCK(); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL) sa = (struct sockaddr *) unp->unp_conn->unp_addr; else { /* * XXX: It seems that this test always fails even when * connection is established. So, this else clause is * added as workaround to return PF_LOCAL sockaddr. */ sa = &sun_noname; } bcopy(sa, *nam, sa->sa_len); UNP_UNLOCK(); return (0); } static int uipc_rcvd(struct socket *so, int flags) { struct unpcb *unp = sotounpcb(so); struct socket *so2; u_long newhiwat; if (unp == NULL) return (EINVAL); UNP_LOCK(); switch (so->so_type) { case SOCK_DGRAM: panic("uipc_rcvd DGRAM?"); /*NOTREACHED*/ case SOCK_STREAM: if (unp->unp_conn == NULL) break; so2 = unp->unp_conn->unp_socket; SOCKBUF_LOCK(&so2->so_snd); SOCKBUF_LOCK(&so->so_rcv); /* * Adjust backpressure on sender * and wakeup any waiting to write. */ so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt; unp->unp_mbcnt = so->so_rcv.sb_mbcnt; newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - so->so_rcv.sb_cc; (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_cc = so->so_rcv.sb_cc; SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_UNLOCK(&so2->so_snd); - sowwakeup(so2); + sowwakeup_locked(so2); break; default: panic("uipc_rcvd unknown socktype"); } UNP_UNLOCK(); return (0); } /* pru_rcvoob is EOPNOTSUPP */ static int uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { int error = 0; struct unpcb *unp = sotounpcb(so); struct socket *so2; u_long newhiwat; if (unp == NULL) { error = EINVAL; goto release; } if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } if (control != NULL && (error = unp_internalize(&control, td))) goto release; UNP_LOCK(); switch (so->so_type) { case SOCK_DGRAM: { const struct sockaddr *from; if (nam != NULL) { if (unp->unp_conn != NULL) { error = EISCONN; break; } error = unp_connect(so, nam, td); if (error) break; } else { if (unp->unp_conn == NULL) { error = ENOTCONN; break; } } so2 = unp->unp_conn->unp_socket; if (unp->unp_addr != NULL) from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; SOCKBUF_LOCK(&so2->so_rcv); if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { - SOCKBUF_UNLOCK(&so2->so_rcv); - sorwakeup(so2); + sorwakeup_locked(so2); m = NULL; control = NULL; } else { SOCKBUF_UNLOCK(&so2->so_rcv); error = ENOBUFS; } if (nam != NULL) unp_disconnect(unp); break; } case SOCK_STREAM: /* Connect if not connected yet. */ /* * Note: A better implementation would complain * if not equal to the peer's address. */ if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { error = unp_connect(so, nam, td); if (error) break; /* XXX */ } else { error = ENOTCONN; break; } } if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; break; } if (unp->unp_conn == NULL) panic("uipc_send connected but no connection?"); so2 = unp->unp_conn->unp_socket; SOCKBUF_LOCK(&so2->so_rcv); /* * Send to paired receive port, and then reduce * send buffer hiwater marks to maintain backpressure. * Wake up readers. 
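 *
 * Editor's illustration, not part of the original source, of the hiwat
 * bookkeeping used here and in uipc_rcvd() above (numbers invented):
 * suppose the sender starts with so_snd.sb_hiwat = 8192 and the peer's
 * receive buffer is empty (the cached unp_cc is 0).  After 4096 bytes are
 * queued here, newhiwat = 8192 - (4096 - 0) = 4096 and unp_cc is updated
 * to 4096.  When the receiver later drains 3072 bytes (sb_cc drops to
 * 1024), uipc_rcvd() computes newhiwat = 4096 + 4096 - 1024 = 7168,
 * giving back exactly the space the reader consumed.  chgsbsize() applies
 * each new limit against the socket owner's sbsize resource accounting.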
*/ if (control != NULL) { if (sbappendcontrol_locked(&so2->so_rcv, m, control)) control = NULL; } else { sbappend_locked(&so2->so_rcv, m); } so->so_snd.sb_mbmax -= so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt; unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt; newhiwat = so->so_snd.sb_hiwat - (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc); (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_conn->unp_cc = so2->so_rcv.sb_cc; - SOCKBUF_UNLOCK(&so2->so_rcv); - sorwakeup(so2); + sorwakeup_locked(so2); m = NULL; break; default: panic("uipc_send unknown socktype"); } /* * SEND_EOF is equivalent to a SEND followed by * a SHUTDOWN. */ if (flags & PRUS_EOF) { socantsendmore(so); unp_shutdown(unp); } UNP_UNLOCK(); if (control != NULL && error != 0) unp_dispose(control); release: if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int uipc_sense(struct socket *so, struct stat *sb) { struct unpcb *unp = sotounpcb(so); struct socket *so2; if (unp == NULL) return (EINVAL); UNP_LOCK(); sb->st_blksize = so->so_snd.sb_hiwat; if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) { so2 = unp->unp_conn->unp_socket; sb->st_blksize += so2->so_rcv.sb_cc; } sb->st_dev = NODEV; if (unp->unp_ino == 0) unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino; sb->st_ino = unp->unp_ino; UNP_UNLOCK(); return (0); } static int uipc_shutdown(struct socket *so) { struct unpcb *unp = sotounpcb(so); if (unp == NULL) return (EINVAL); UNP_LOCK(); socantsendmore(so); unp_shutdown(unp); UNP_UNLOCK(); return (0); } static int uipc_sockaddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp = sotounpcb(so); const struct sockaddr *sa; if (unp == NULL) return (EINVAL); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_LOCK(); if (unp->unp_addr != NULL) sa = (struct sockaddr *) unp->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_UNLOCK(); return (0); } struct pr_usrreqs uipc_usrreqs = { uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect, uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect, uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp, uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr, sosend, soreceive, sopoll, pru_sosetlabel_null }; int uipc_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { struct unpcb *unp = sotounpcb(so); struct xucred xu; int error; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case LOCAL_PEERCRED: error = 0; UNP_LOCK(); if (unp->unp_flags & UNP_HAVEPC) xu = unp->unp_peercred; else { if (so->so_type == SOCK_STREAM) error = ENOTCONN; else error = EINVAL; } UNP_UNLOCK(); if (error == 0) error = sooptcopyout(sopt, &xu, sizeof(xu)); break; default: error = EOPNOTSUPP; break; } break; case SOPT_SET: default: error = EOPNOTSUPP; break; } return (error); } /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering * for stream sockets, although the total for sender and receiver is * actually only PIPSIZ. * Datagram sockets really use the sendspace as the maximum datagram size, * and don't really want to reserve the sendspace. Their recvspace should * be large enough for at least one max-size datagram plus address. 
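 *
 * Editor's illustration, not part of the original source: with the
 * defaults below, a PF_LOCAL datagram larger than unpdg_sendspace is
 * refused by sosend() with EMSGSIZE.  A userland demonstration (minimal
 * sketch, error handling trimmed):
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	int
 *	main(void)
 *	{
 *		char big[4096];
 *		int fds[2];
 *
 *		memset(big, 0, sizeof(big));
 *		socketpair(PF_LOCAL, SOCK_DGRAM, 0, fds);
 *		if (send(fds[0], big, sizeof(big), 0) == -1)
 *			printf("send: %s\n", strerror(errno));
 *		return (0);
 *	}
 *
 * Raising net.local.dgram.maxdgram (and net.local.dgram.recvspace to
 * match) with sysctl(8) lifts the limit for subsequently created sockets.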
*/ #ifndef PIPSIZ #define PIPSIZ 8192 #endif static u_long unpst_sendspace = PIPSIZ; static u_long unpst_recvspace = PIPSIZ; static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ static u_long unpdg_recvspace = 4*1024; static int unp_rights; /* file descriptors in flight */ SYSCTL_DECL(_net_local_stream); SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, &unpst_sendspace, 0, ""); SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, &unpst_recvspace, 0, ""); SYSCTL_DECL(_net_local_dgram); SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, &unpdg_sendspace, 0, ""); SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, &unpdg_recvspace, 0, ""); SYSCTL_DECL(_net_local); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); static int unp_attach(so) struct socket *so; { register struct unpcb *unp; int error; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { case SOCK_STREAM: error = soreserve(so, unpst_sendspace, unpst_recvspace); break; case SOCK_DGRAM: error = soreserve(so, unpdg_sendspace, unpdg_recvspace); break; default: panic("unp_attach"); } if (error) return (error); } unp = uma_zalloc(unp_zone, M_WAITOK); if (unp == NULL) return (ENOBUFS); bzero(unp, sizeof *unp); LIST_INIT(&unp->unp_refs); unp->unp_socket = so; UNP_LOCK(); unp->unp_gencnt = ++unp_gencnt; unp_count++; LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead, unp, unp_link); UNP_UNLOCK(); so->so_pcb = unp; return (0); } static void unp_detach(unp) register struct unpcb *unp; { struct vnode *vp; UNP_LOCK_ASSERT(); LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; if ((vp = unp->unp_vnode) != NULL) { /* * XXXRW: should v_socket be frobbed only while holding * Giant? */ unp->unp_vnode->v_socket = NULL; unp->unp_vnode = NULL; } if (unp->unp_conn != NULL) unp_disconnect(unp); while (!LIST_EMPTY(&unp->unp_refs)) { struct unpcb *ref = LIST_FIRST(&unp->unp_refs); unp_drop(ref, ECONNRESET); } soisdisconnected(unp->unp_socket); unp->unp_socket->so_pcb = NULL; if (unp_rights) { /* * Normally the receive buffer is flushed later, * in sofree, but if our receive buffer holds references * to descriptors that are now garbage, we will dispose * of those descriptor references after the garbage collector * gets them (resulting in a "panic: closef: count < 0"). */ sorflush(unp->unp_socket); unp_gc(); } UNP_UNLOCK(); if (unp->unp_addr != NULL) FREE(unp->unp_addr, M_SONAME); uma_zfree(unp_zone, unp); if (vp) { mtx_lock(&Giant); vrele(vp); mtx_unlock(&Giant); } } static int unp_bind(unp, nam, td) struct unpcb *unp; struct sockaddr *nam; struct thread *td; { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vnode *vp; struct mount *mp; struct vattr vattr; int error, namelen; struct nameidata nd; char *buf; /* * XXXRW: This test-and-set of unp_vnode is non-atomic; the * unlocked read here is fine, but the value of unp_vnode needs * to be tested again after we do all the lookups to see if the * pcb is still unbound? 
*/ if (unp->unp_vnode != NULL) return (EINVAL); namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); if (namelen <= 0) return (EINVAL); buf = malloc(namelen + 1, M_TEMP, M_WAITOK); strlcpy(buf, soun->sun_path, namelen + 1); mtx_lock(&Giant); restart: mtx_assert(&Giant, MA_OWNED); NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE, buf, td); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); if (error) goto done; vp = nd.ni_vp; if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (vp != NULL) { vrele(vp); error = EADDRINUSE; goto done; } error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error) goto done; goto restart; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); #ifdef MAC error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (error) goto done; vp = nd.ni_vp; ASSERT_VOP_LOCKED(vp, "unp_bind"); soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); UNP_LOCK(); vp->v_socket = unp->unp_socket; unp->unp_vnode = vp; unp->unp_addr = soun; UNP_UNLOCK(); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); done: mtx_unlock(&Giant); free(buf, M_TEMP); return (error); } static int unp_connect(so, nam, td) struct socket *so; struct sockaddr *nam; struct thread *td; { register struct sockaddr_un *soun = (struct sockaddr_un *)nam; register struct vnode *vp; register struct socket *so2, *so3; struct unpcb *unp = sotounpcb(so); struct unpcb *unp2, *unp3; int error, len; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; struct sockaddr *sa; UNP_LOCK_ASSERT(); len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return (EINVAL); strlcpy(buf, soun->sun_path, len + 1); UNP_UNLOCK(); sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); error = namei(&nd); if (error) vp = NULL; else vp = nd.ni_vp; ASSERT_VOP_LOCKED(vp, "unp_connect"); NDFREE(&nd, NDF_ONLY_PNBUF); if (error) goto bad; if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); if (error) goto bad; so2 = vp->v_socket; if (so2 == NULL) { error = ECONNREFUSED; goto bad; } if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad; } mtx_unlock(&Giant); UNP_LOCK(); if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if (so2->so_options & SO_ACCEPTCONN) { /* * NB: drop locks here so unp_attach is entered * w/o locks; this avoids a recursive lock * of the head and holding sleep locks across * a (potentially) blocking malloc. */ UNP_UNLOCK(); so3 = sonewconn(so2, 0); UNP_LOCK(); } else so3 = NULL; if (so3 == NULL) { error = ECONNREFUSED; goto bad2; } unp = sotounpcb(so); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); unp3->unp_addr = (struct sockaddr_un *) sa; sa = NULL; } /* * unp_peercred management: * * The connecter's (client's) credentials are copied * from its process structure at the time of connect() * (which is now). 
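 *
 * Editor's illustration, not part of the original source: the credentials
 * cached here (and by unp_listen() below) are what the peer later reads
 * with the LOCAL_PEERCRED option handled in uipc_ctloutput() above.  A
 * userland consumer looks roughly like this (minimal sketch; "fd" is a
 * connected PF_LOCAL stream socket, and the level argument is ignored by
 * uipc_ctloutput(), so 0 is conventionally passed):
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/ucred.h>
 *	#include <sys/un.h>
 *	#include <stdio.h>
 *
 *	static void
 *	show_peer(int fd)
 *	{
 *		struct xucred xu;
 *		socklen_t len = sizeof(xu);
 *
 *		if (getsockopt(fd, 0, LOCAL_PEERCRED, &xu, &len) == 0)
 *			printf("peer euid %u, %d groups\n",
 *			    (unsigned)xu.cr_uid, xu.cr_ngroups);
 *	}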
*/ cru2x(td->td_ucred, &unp3->unp_peercred); unp3->unp_flags |= UNP_HAVEPC; /* * The receiver's (server's) credentials are copied * from the unp_peercred member of socket on which the * former called listen(); unp_listen() cached that * process's credentials at that time so we can use * them now. */ KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, ("unp_connect: listener without cached peercred")); memcpy(&unp->unp_peercred, &unp2->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; #ifdef MAC SOCK_LOCK(so); mac_set_socket_peer_from_socket(so, so3); mac_set_socket_peer_from_socket(so3, so); SOCK_UNLOCK(so); #endif so2 = so3; } error = unp_connect2(so, so2); bad2: UNP_UNLOCK(); mtx_lock(&Giant); bad: mtx_assert(&Giant, MA_OWNED); if (vp != NULL) vput(vp); mtx_unlock(&Giant); free(sa, M_SONAME); UNP_LOCK(); return (error); } static int unp_connect2(so, so2) register struct socket *so; register struct socket *so2; { register struct unpcb *unp = sotounpcb(so); register struct unpcb *unp2; UNP_LOCK_ASSERT(); if (so2->so_type != so->so_type) return (EPROTOTYPE); unp2 = sotounpcb(so2); unp->unp_conn = unp2; switch (so->so_type) { case SOCK_DGRAM: LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); soisconnected(so); break; case SOCK_STREAM: unp2->unp_conn = unp; soisconnected(so); soisconnected(so2); break; default: panic("unp_connect2"); } return (0); } static void unp_disconnect(unp) struct unpcb *unp; { register struct unpcb *unp2 = unp->unp_conn; struct socket *so; UNP_LOCK_ASSERT(); if (unp2 == NULL) return; unp->unp_conn = NULL; switch (unp->unp_socket->so_type) { case SOCK_DGRAM: LIST_REMOVE(unp, unp_reflink); so = unp->unp_socket; SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); break; case SOCK_STREAM: soisdisconnected(unp->unp_socket); unp2->unp_conn = NULL; soisdisconnected(unp2->unp_socket); break; } } #ifdef notdef void unp_abort(unp) struct unpcb *unp; { unp_detach(unp); } #endif /* * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed * by the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers * are safe to reference. It first scans the list of struct unpcb's to * generate a pointer list, then it rescans its list one entry at a time to * externalize and copyout. It checks the generation number to see if a * struct unpcb has been reused, and will skip it if so. */ static int unp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct unpcb *unp, **unp_list; unp_gen_t gencnt; struct xunpgen *xug; struct unp_head *head; struct xunpcb *xu; head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = unp_count; req->oldidx = 2 * (sizeof *xug) + (n + n/8) * sizeof(struct xunpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. 
*/ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); UNP_LOCK(); gencnt = unp_gencnt; n = unp_count; UNP_UNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; xug->xug_gen = gencnt; xug->xug_sogen = so_gencnt; error = SYSCTL_OUT(req, xug, sizeof *xug); if (error) { free(xug, M_TEMP); return (error); } unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); UNP_LOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { if (unp->unp_gencnt <= gencnt) { if (cr_cansee(req->td->td_ucred, unp->unp_socket->so_cred)) continue; unp_list[i++] = unp; } } UNP_UNLOCK(); n = i; /* in case we lost some during malloc */ error = 0; xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK); for (i = 0; i < n; i++) { unp = unp_list[i]; if (unp->unp_gencnt <= gencnt) { xu->xu_len = sizeof *xu; xu->xu_unpp = unp; /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. */ if (unp->unp_addr != NULL) bcopy(unp->unp_addr, &xu->xu_addr, unp->unp_addr->sun_len); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) bcopy(unp->unp_conn->unp_addr, &xu->xu_caddr, unp->unp_conn->unp_addr->sun_len); bcopy(unp, &xu->xu_unp, sizeof *unp); sotoxsocket(unp->unp_socket, &xu->xu_socket); error = SYSCTL_OUT(req, xu, sizeof *xu); } } free(xu, M_TEMP); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ xug->xug_gen = unp_gencnt; xug->xug_sogen = so_gencnt; xug->xug_count = unp_count; error = SYSCTL_OUT(req, xug, sizeof *xug); } free(unp_list, M_TEMP); free(xug, M_TEMP); return (error); } SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); static void unp_shutdown(unp) struct unpcb *unp; { struct socket *so; UNP_LOCK_ASSERT(); if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && (so = unp->unp_conn->unp_socket)) socantrcvmore(so); } static void unp_drop(unp, errno) struct unpcb *unp; int errno; { struct socket *so = unp->unp_socket; UNP_LOCK_ASSERT(); so->so_error = errno; unp_disconnect(unp); } #ifdef notdef void unp_drain() { } #endif static void unp_freerights(rp, fdcount) struct file **rp; int fdcount; { int i; struct file *fp; for (i = 0; i < fdcount; i++) { fp = *rp; /* * zero the pointer before calling * unp_discard since it may end up * in unp_gc().. */ *rp++ = 0; unp_discard(fp); } } int unp_externalize(control, controlp) struct mbuf *control, **controlp; { struct thread *td = curthread; /* XXX */ struct cmsghdr *cm = mtod(control, struct cmsghdr *); int i; int *fdp; struct file **rp; struct file *fp; void *data; socklen_t clen = control->m_len, datalen; int error, newfds; int f; u_int newlen; error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) { error = EINVAL; break; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { newfds = datalen / sizeof(struct file *); rp = data; /* If we're not outputting the descriptors free them. 
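 *
 * Editor's illustration, not part of the original source: unp_externalize()
 * is what turns the in-kernel struct file pointers back into descriptors
 * in the receiving process.  The matching userland receive path looks
 * roughly like this (minimal sketch, single descriptor, error handling
 * trimmed):
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *	#include <string.h>
 *
 *	static int
 *	recv_fd(int sock)
 *	{
 *		struct msghdr msg;
 *		struct iovec iov;
 *		struct cmsghdr *cm;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *		char byte;
 *		int fd = -1;
 *
 *		memset(&msg, 0, sizeof(msg));
 *		iov.iov_base = &byte;
 *		iov.iov_len = 1;
 *		msg.msg_iov = &iov;
 *		msg.msg_iovlen = 1;
 *		msg.msg_control = buf;
 *		msg.msg_controllen = sizeof(buf);
 *		if (recvmsg(sock, &msg, 0) == -1)
 *			return (-1);
 *		cm = CMSG_FIRSTHDR(&msg);
 *		if (cm != NULL && cm->cmsg_level == SOL_SOCKET &&
 *		    cm->cmsg_type == SCM_RIGHTS)
 *			memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
 *		return (fd);
 *	}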
*/ if (error || controlp == NULL) { unp_freerights(rp, newfds); goto next; } FILEDESC_LOCK(td->td_proc->p_fd); /* if the new FD's will not fit free them. */ if (!fdavail(td, newfds)) { FILEDESC_UNLOCK(td->td_proc->p_fd); error = EMSGSIZE; unp_freerights(rp, newfds); goto next; } /* * now change each pointer to an fd in the global * table to an integer that is the index to the * local fd table entry that we set up to point * to the global one we are transferring. */ newlen = newfds * sizeof(int); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_UNLOCK(td->td_proc->p_fd); error = E2BIG; unp_freerights(rp, newfds); goto next; } fdp = (int *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < newfds; i++) { if (fdalloc(td, 0, &f)) panic("unp_externalize fdalloc failed"); fp = *rp++; td->td_proc->p_fd->fd_ofiles[f] = fp; FILE_LOCK(fp); fp->f_msgcount--; FILE_UNLOCK(fp); unp_rights--; *fdp++ = f; } FILEDESC_UNLOCK(td->td_proc->p_fd); } else { /* We can just copy anything else across */ if (error || controlp == NULL) goto next; *controlp = sbcreatecontrol(NULL, datalen, cm->cmsg_type, cm->cmsg_level); if (*controlp == NULL) { error = ENOBUFS; goto next; } bcopy(data, CMSG_DATA(mtod(*controlp, struct cmsghdr *)), datalen); } controlp = &(*controlp)->m_next; next: if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } m_freem(control); return (error); } void unp_init(void) { unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); if (unp_zone == NULL) panic("unp_init"); uma_zone_set_max(unp_zone, nmbclusters); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); UNP_LOCK_INIT(); } static int unp_internalize(controlp, td) struct mbuf **controlp; struct thread *td; { struct mbuf *control = *controlp; struct proc *p = td->td_proc; struct filedesc *fdescp = p->p_fd; struct cmsghdr *cm = mtod(control, struct cmsghdr *); struct cmsgcred *cmcred; struct file **rp; struct file *fp; struct timeval *tv; int i, fd, *fdp; void *data; socklen_t clen = control->m_len, datalen; int error, oldfds; u_int newlen; error = 0; *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > clen) { error = EINVAL; goto out; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; switch (cm->cmsg_type) { /* * Fill in credential information. */ case SCM_CREDS: *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } cmcred = (struct cmsgcred *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); cmcred->cmcred_pid = p->p_pid; cmcred->cmcred_uid = td->td_ucred->cr_ruid; cmcred->cmcred_gid = td->td_ucred->cr_rgid; cmcred->cmcred_euid = td->td_ucred->cr_uid; cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); for (i = 0; i < cmcred->cmcred_ngroups; i++) cmcred->cmcred_groups[i] = td->td_ucred->cr_groups[i]; break; case SCM_RIGHTS: oldfds = datalen / sizeof (int); /* * check that all the FDs passed in refer to legal files * If not, reject the entire operation. 
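 *
 * Editor's illustration, not part of the original source: this validation
 * is applied to descriptors a process passes in with sendmsg(2).  The
 * sending side that unp_internalize() services looks roughly like this
 * (minimal sketch, single descriptor, error handling trimmed):
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *	#include <string.h>
 *
 *	static int
 *	send_fd(int sock, int fd)
 *	{
 *		struct msghdr msg;
 *		struct iovec iov;
 *		struct cmsghdr *cm;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *		char byte = 0;
 *
 *		memset(&msg, 0, sizeof(msg));
 *		memset(buf, 0, sizeof(buf));
 *		iov.iov_base = &byte;
 *		iov.iov_len = 1;
 *		msg.msg_iov = &iov;
 *		msg.msg_iovlen = 1;
 *		msg.msg_control = buf;
 *		msg.msg_controllen = sizeof(buf);
 *		cm = CMSG_FIRSTHDR(&msg);
 *		cm->cmsg_len = CMSG_LEN(sizeof(int));
 *		cm->cmsg_level = SOL_SOCKET;
 *		cm->cmsg_type = SCM_RIGHTS;
 *		memcpy(CMSG_DATA(cm), &fd, sizeof(fd));
 *		return (sendmsg(sock, &msg, 0) == -1 ? -1 : 0);
 *	}
 *
 * Each descriptor accepted below has its f_count and f_msgcount bumped,
 * and unp_rights tracks how many are in flight for the garbage collector.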
*/ fdp = data; FILEDESC_LOCK(fdescp); for (i = 0; i < oldfds; i++) { fd = *fdp++; if ((unsigned)fd >= fdescp->fd_nfiles || fdescp->fd_ofiles[fd] == NULL) { FILEDESC_UNLOCK(fdescp); error = EBADF; goto out; } fp = fdescp->fd_ofiles[fd]; if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { FILEDESC_UNLOCK(fdescp); error = EOPNOTSUPP; goto out; } } /* * Now replace the integer FDs with pointers to * the associated global file table entry.. */ newlen = oldfds * sizeof(struct file *); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_UNLOCK(fdescp); error = E2BIG; goto out; } fdp = data; rp = (struct file **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < oldfds; i++) { fp = fdescp->fd_ofiles[*fdp++]; *rp++ = fp; FILE_LOCK(fp); fp->f_count++; fp->f_msgcount++; FILE_UNLOCK(fp); unp_rights++; } FILEDESC_UNLOCK(fdescp); break; case SCM_TIMESTAMP: *controlp = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } tv = (struct timeval *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); microtime(tv); break; default: error = EINVAL; goto out; } controlp = &(*controlp)->m_next; if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } out: m_freem(control); return (error); } static int unp_defer, unp_gcing; static void unp_gc() { register struct file *fp, *nextfp; register struct socket *so; struct file **extra_ref, **fpp; int nunref, i; UNP_LOCK_ASSERT(); if (unp_gcing) return; unp_gcing = 1; unp_defer = 0; /* * before going through all this, set all FDs to * be NOT defered and NOT externally accessible */ /* * XXXRW: Acquiring a sleep lock while holding UNP * mutex cannot be a good thing. */ sx_slock(&filelist_lock); LIST_FOREACH(fp, &filehead, f_list) fp->f_gcflag &= ~(FMARK|FDEFER); do { LIST_FOREACH(fp, &filehead, f_list) { FILE_LOCK(fp); /* * If the file is not open, skip it */ if (fp->f_count == 0) { FILE_UNLOCK(fp); continue; } /* * If we already marked it as 'defer' in a * previous pass, then try process it this time * and un-mark it */ if (fp->f_gcflag & FDEFER) { fp->f_gcflag &= ~FDEFER; unp_defer--; } else { /* * if it's not defered, then check if it's * already marked.. if so skip it */ if (fp->f_gcflag & FMARK) { FILE_UNLOCK(fp); continue; } /* * If all references are from messages * in transit, then skip it. it's not * externally accessible. */ if (fp->f_count == fp->f_msgcount) { FILE_UNLOCK(fp); continue; } /* * If it got this far then it must be * externally accessible. */ fp->f_gcflag |= FMARK; } /* * either it was defered, or it is externally * accessible and not already marked so. * Now check if it is possibly one of OUR sockets. */ if (fp->f_type != DTYPE_SOCKET || (so = fp->f_data) == NULL) { FILE_UNLOCK(fp); continue; } FILE_UNLOCK(fp); if (so->so_proto->pr_domain != &localdomain || (so->so_proto->pr_flags&PR_RIGHTS) == 0) continue; #ifdef notdef if (so->so_rcv.sb_flags & SB_LOCK) { /* * This is problematical; it's not clear * we need to wait for the sockbuf to be * unlocked (on a uniprocessor, at least), * and it's also not clear what to do * if sbwait returns an error due to receipt * of a signal. If sbwait does return * an error, we'll go into an infinite * loop. Delete all of this for now. */ (void) sbwait(&so->so_rcv); goto restart; } #endif /* * So, Ok, it's one of our sockets and it IS externally * accessible (or was defered). 
Now we look * to see if we hold any file descriptors in its * message buffers. Follow those links and mark them * as accessible too. */ unp_scan(so->so_rcv.sb_mb, unp_mark); } } while (unp_defer); sx_sunlock(&filelist_lock); /* * We grab an extra reference to each of the file table entries * that are not otherwise accessible and then free the rights * that are stored in messages on them. * * The bug in the orginal code is a little tricky, so I'll describe * what's wrong with it here. * * It is incorrect to simply unp_discard each entry for f_msgcount * times -- consider the case of sockets A and B that contain * references to each other. On a last close of some other socket, * we trigger a gc since the number of outstanding rights (unp_rights) * is non-zero. If during the sweep phase the gc code un_discards, * we end up doing a (full) closef on the descriptor. A closef on A * results in the following chain. Closef calls soo_close, which * calls soclose. Soclose calls first (through the switch * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply * returns because the previous instance had set unp_gcing, and * we return all the way back to soclose, which marks the socket * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush * to free up the rights that are queued in messages on the socket A, * i.e., the reference on B. The sorflush calls via the dom_dispose * switch unp_dispose, which unp_scans with unp_discard. This second * instance of unp_discard just calls closef on B. * * Well, a similar chain occurs on B, resulting in a sorflush on B, * which results in another closef on A. Unfortunately, A is already * being closed, and the descriptor has already been marked with * SS_NOFDREF, and soclose panics at this point. * * Here, we first take an extra reference to each inaccessible * descriptor. Then, we call sorflush ourself, since we know * it is a Unix domain socket anyhow. After we destroy all the * rights carried in messages, we do a last closef to get rid * of our extra reference. This is the last close, and the * unp_detach etc will shut down the socket. * * 91/09/19, bsy@cs.cmu.edu */ extra_ref = malloc(nfiles * sizeof(struct file *), M_TEMP, M_WAITOK); sx_slock(&filelist_lock); for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != NULL; fp = nextfp) { nextfp = LIST_NEXT(fp, f_list); FILE_LOCK(fp); /* * If it's not open, skip it */ if (fp->f_count == 0) { FILE_UNLOCK(fp); continue; } /* * If all refs are from msgs, and it's not marked accessible * then it must be referenced from some unreachable cycle * of (shut-down) FDs, so include it in our * list of FDs to remove */ if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { *fpp++ = fp; nunref++; fp->f_count++; } FILE_UNLOCK(fp); } sx_sunlock(&filelist_lock); /* * for each FD on our hit list, do the following two things */ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { struct file *tfp = *fpp; FILE_LOCK(tfp); if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) { FILE_UNLOCK(tfp); sorflush(tfp->f_data); } else { FILE_UNLOCK(tfp); } } for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) closef(*fpp, (struct thread *) NULL); free(extra_ref, M_TEMP); unp_gcing = 0; } void unp_dispose(m) struct mbuf *m; { if (m) unp_scan(m, unp_discard); } static int unp_listen(unp, td) struct unpcb *unp; struct thread *td; { UNP_LOCK_ASSERT(); /* * XXXRW: Why populate the local peer cred with our own credential? 
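 *
 * Editor's note, not part of the original source: the credential cached
 * here is the listener's own, stored under UNP_HAVEPCCACHED so that
 * unp_connect() above can copy it into each connecting socket's
 * unp_peercred.  Recording it now, at listen(2) time, is what lets a
 * client later see the server's identity through LOCAL_PEERCRED even if
 * the server's credentials change between listen() and accept().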
*/ cru2x(td->td_ucred, &unp->unp_peercred); unp->unp_flags |= UNP_HAVEPCCACHED; return (0); } static void unp_scan(m0, op) register struct mbuf *m0; void (*op)(struct file *); { struct mbuf *m; struct file **rp; struct cmsghdr *cm; void *data; int i; socklen_t clen, datalen; int qfds; while (m0 != NULL) { for (m = m0; m; m = m->m_next) { if (m->m_type != MT_CONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) break; data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { qfds = datalen / sizeof (struct file *); rp = data; for (i = 0; i < qfds; i++) (*op)(*rp++); } if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } } m0 = m0->m_act; } } static void unp_mark(fp) struct file *fp; { if (fp->f_gcflag & FMARK) return; unp_defer++; fp->f_gcflag |= (FMARK|FDEFER); } static void unp_discard(fp) struct file *fp; { FILE_LOCK(fp); fp->f_msgcount--; unp_rights--; FILE_UNLOCK(fp); (void) closef(fp, (struct thread *)NULL); } Index: head/sys/netinet/ip_divert.c =================================================================== --- head/sys/netinet/ip_divert.c (revision 131150) +++ head/sys/netinet/ip_divert.c (revision 131151) @@ -1,670 +1,672 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_ipsec.h" #include "opt_mac.h" #ifndef INET #error "IPDIVERT requires INET." 
#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Divert sockets work in conjunction with ipfw, see the divert(4) * manpage for features. * Internally, packets selected by ipfw in ip_input() or ip_output(), * and never diverted before, are passed to the input queue of the * divert socket with a given 'divert_port' number (as specified in * the matching ipfw rule), and they are tagged with a 16 bit cookie * (representing the rule number of the matching ipfw rule), which * is passed to process reading from the socket. * * Packets written to the divert socket are again tagged with a cookie * (usually the same as above) and a destination address. * If the destination address is INADDR_ANY then the packet is * treated as outgoing and sent to ip_output(), otherwise it is * treated as incoming and sent to ip_input(). * In both cases, the packet is tagged with the cookie. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * ipfw processing will start at the rule number after the one * written in the cookie (so, tagging a packet with a cookie of 0 * will cause it to be effectively considered as a standard packet). */ /* Internal variables */ static struct inpcbhead divcb; static struct inpcbinfo divcbinfo; static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ /* * Initialize divert connection block queue. */ void div_init(void) { INP_INFO_LOCK_INIT(&divcbinfo, "div"); LIST_INIT(&divcb); divcbinfo.listhead = &divcb; /* * XXX We don't use the hash list for divert IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask); divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask); divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(divcbinfo.ipi_zone, maxsockets); } /* * IPPROTO_DIVERT is not in the real IP protocol number space; this * function should never be called. Just in case, drop any packets. */ void div_input(struct mbuf *m, int off) { ipstat.ips_noproto++; m_freem(m); } /* * Divert a packet by passing it up to the divert socket at port 'port'. * * Setup generic address and protocol structures for div_input routine, * then pass them along with mbuf chain. */ void divert_packet(struct mbuf *m, int incoming) { struct ip *ip; struct inpcb *inp; struct socket *sa; u_int16_t nport; struct sockaddr_in divsrc; struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); if (mtag == NULL) { printf("%s: no divert tag\n", __func__); m_freem(m); return; } /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == 0) return; ip = mtod(m, struct ip *); /* * Record receive interface address, if any. * But only for incoming packets. 
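 *
 * Editor's illustration, not part of the original source: the sockaddr_in
 * built here and just below (matching-rule cookie in sin_port, receive
 * interface address in sin_addr, interface name stashed in sin_zero) is
 * what a divert consumer sees from recvfrom(2).  A minimal userland loop
 * that reads diverted packets and reinjects them unchanged, in the
 * direction they were travelling, looks roughly like this (8668 is an
 * arbitrary example port; a matching ipfw divert rule must exist, and
 * error handling is trimmed):
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct sockaddr_in sin;
 *		socklen_t sinlen;
 *		char pkt[65535];
 *		ssize_t n;
 *		int s;
 *
 *		s = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
 *		memset(&sin, 0, sizeof(sin));
 *		sin.sin_family = AF_INET;
 *		sin.sin_port = htons(8668);
 *		bind(s, (struct sockaddr *)&sin, sizeof(sin));
 *		for (;;) {
 *			sinlen = sizeof(sin);
 *			n = recvfrom(s, pkt, sizeof(pkt), 0,
 *			    (struct sockaddr *)&sin, &sinlen);
 *			if (n <= 0)
 *				break;
 *			sendto(s, pkt, n, 0,
 *			    (struct sockaddr *)&sin, sinlen);
 *		}
 *		return (0);
 *	}
 *
 * Reusing the recvfrom() address for sendto() preserves the cookie, so
 * ipfw processing resumes after the matching rule on reinjection.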
*/ bzero(&divsrc, sizeof(divsrc)); divsrc.sin_len = sizeof(divsrc); divsrc.sin_family = AF_INET; divsrc.sin_port = divert_cookie(mtag); /* record matching rule */ if (incoming) { struct ifaddr *ifa; /* Sanity check */ M_ASSERTPKTHDR(m); /* Find IP address for receive interface */ TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } } /* * Record the incoming interface name whenever we have one. */ if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in there in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. * There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket. Theoretically if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, sizeof(divsrc.sin_zero)); } /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)divert_info(mtag)); INP_INFO_RLOCK(&divcbinfo); LIST_FOREACH(inp, &divcb, inp_list) { INP_LOCK(inp); /* XXX why does only one socket match? */ if (inp->inp_lport == nport) { sa = inp->inp_socket; - if (sbappendaddr(&sa->so_rcv, + SOCKBUF_LOCK(&sa->so_rcv); + if (sbappendaddr_locked(&sa->so_rcv, (struct sockaddr *)&divsrc, m, - (struct mbuf *)0) == 0) + (struct mbuf *)0) == 0) { sa = NULL; /* force mbuf reclaim below */ - else - sorwakeup(sa); + SOCKBUF_UNLOCK(&sa->so_rcv); + } else + sorwakeup_locked(sa); INP_UNLOCK(inp); break; } INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&divcbinfo); if (sa == NULL) { m_freem(m); ipstat.ips_noproto++; ipstat.ips_delivered--; } } /* * Deliver packet back into the IP processing machinery. * * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { int error = 0; KASSERT(m->m_pkthdr.rcvif == NULL, ("rcvif not null")); if (control) m_freem(control); /* XXX */ /* Loopback avoidance and state recovery */ if (sin) { struct m_tag *mtag; struct divert_tag *dt; int i; mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), M_NOWAIT); if (mtag == NULL) { error = ENOBUFS; goto cantsend; } dt = (struct divert_tag *)(mtag+1); dt->info = 0; dt->cookie = sin->sin_port; m_tag_prepend(m, mtag); /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. * The name is user supplied data so don't trust its size * or that it is zero terminated. 
*/ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if ( i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { struct ip *const ip = mtod(m, struct ip *); struct inpcb *inp; INP_INFO_WLOCK(&divcbinfo); inp = sotoinpcb(so); INP_LOCK(inp); /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { error = EINVAL; m_freem(m); } else { /* Convert fields to host order for ip_output() */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); /* Send packet to output processing */ ipstat.ips_rawout++; /* XXX */ #ifdef MAC mac_create_mbuf_from_inpcb(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, inp->inp_moptions, NULL); } INP_UNLOCK(inp); INP_INFO_WUNLOCK(&divcbinfo); } else { if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. */ struct ifaddr *ifa; bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; } #ifdef MAC SOCK_LOCK(so); mac_create_mbuf_from_socket(so, m); SOCK_UNLOCK(so); #endif /* Send packet to input processing */ ip_input(m); } return error; cantsend: m_freem(m); return error; } /* * Return a copy of the specified packet, but without * the divert tag. This is used when packets are ``tee'd'' * and we want the cloned copy to not have divert processing. */ struct mbuf * divert_clone(struct mbuf *m) { struct mbuf *clone; struct m_tag *mtag; clone = m_dup(m, M_DONTWAIT); if (clone != NULL) { /* strip divert tag from copy */ mtag = m_tag_find(clone, PACKET_TAG_DIVERT, NULL); if (mtag != NULL) m_tag_delete(clone, mtag); } return clone; } static int div_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; INP_INFO_WLOCK(&divcbinfo); inp = sotoinpcb(so); if (inp != 0) { INP_INFO_WUNLOCK(&divcbinfo); return EINVAL; } if (td && (error = suser(td)) != 0) { INP_INFO_WUNLOCK(&divcbinfo); return error; } error = soreserve(so, div_sendspace, div_recvspace); if (error) { INP_INFO_WUNLOCK(&divcbinfo); return error; } error = in_pcballoc(so, &divcbinfo, "divinp"); if (error) { INP_INFO_WUNLOCK(&divcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_LOCK(inp); INP_INFO_WUNLOCK(&divcbinfo); inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; /* The socket is always "connected" because we always know "where" to send the packet */ INP_UNLOCK(inp); SOCK_LOCK(so); so->so_state |= SS_ISCONNECTED; SOCK_UNLOCK(so); return 0; } static int div_detach(struct socket *so) { struct inpcb *inp; INP_INFO_WLOCK(&divcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&divcbinfo); return EINVAL; } INP_LOCK(inp); in_pcbdetach(inp); INP_INFO_WUNLOCK(&divcbinfo); return 0; } static int div_abort(struct socket *so) { struct inpcb *inp; INP_INFO_WLOCK(&divcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&divcbinfo); return EINVAL; /* ??? possible? panic instead? 
*/ } INP_LOCK(inp); soisdisconnected(so); in_pcbdetach(inp); INP_INFO_WUNLOCK(&divcbinfo); return 0; } static int div_disconnect(struct socket *so) { if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; return div_abort(so); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; INP_INFO_WLOCK(&divcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&divcbinfo); return EINVAL; } /* in_pcbbind assumes that nam is a sockaddr_in * and in_pcbbind requires a valid address. Since divert * sockets don't we need to make sure the address is * filled in properly. * XXX -- divert should not be abusing in_pcbind * and should probably have its own family. */ if (nam->sa_family != AF_INET) error = EAFNOSUPPORT; else { ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; INP_LOCK(inp); error = in_pcbbind(inp, nam, td->td_ucred); INP_UNLOCK(inp); } INP_INFO_WUNLOCK(&divcbinfo); return error; } static int div_shutdown(struct socket *so) { struct inpcb *inp; INP_INFO_RLOCK(&divcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_RUNLOCK(&divcbinfo); return EINVAL; } INP_LOCK(inp); INP_INFO_RUNLOCK(&divcbinfo); socantsendmore(so); INP_UNLOCK(inp); return 0; } static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { ipstat.ips_toosmall++; m_freem(m); return EINVAL; } /* Send packet */ return div_output(so, m, (struct sockaddr_in *)nam, control); } void div_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_addr faddr; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) return; } static int div_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = divcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&divcbinfo); gencnt = divcbinfo.ipi_gencnt; n = divcbinfo.ipi_count; INP_INFO_RUNLOCK(&divcbinfo); error = sysctl_wire_old_buffer(req, 2 * sizeof(xig) + n*sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; INP_INFO_RLOCK(&divcbinfo); for (inp = LIST_FIRST(divcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&divcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); } } if (!error) { /* * Give the user an updated idea of our state. 
* If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&divcbinfo); xig.xig_gen = divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = divcbinfo.ipi_count; INP_INFO_RUNLOCK(&divcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int div_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &divcbinfo)); } /* * This is the wrapper function for in_setpeeraddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int div_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &divcbinfo)); } SYSCTL_DECL(_net_inet_divert); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); struct pr_usrreqs div_usrreqs = { div_abort, pru_accept_notsupp, div_attach, div_bind, pru_connect_notsupp, pru_connect2_notsupp, in_control, div_detach, div_disconnect, pru_listen_notsupp, div_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, div_send, pru_sense_null, div_shutdown, div_sockaddr, sosend, soreceive, sopoll, in_pcbsosetlabel }; Index: head/sys/netinet/ip_mroute.c =================================================================== --- head/sys/netinet/ip_mroute.c (revision 131150) +++ head/sys/netinet/ip_mroute.c (revision 131151) @@ -1,3430 +1,3433 @@ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * Modified by Ahmed Helmy, SGI, June 1996 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, * bandwidth metering and signaling * * $FreeBSD$ */ #include "opt_mac.h" #include "opt_mrouting.h" #include "opt_random_ip_id.h" #ifdef PIM #define _PIM_VT 1 #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PIM #include #include #endif #include #include /* * Control debugging code for rsvp and multicast routing code. * Can only set them with the debugger. */ static u_int rsvpdebug; /* non-zero enables debugging */ static u_int mrtdebug; /* any set of the flags below */ #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 #define DEBUG_PIM 0x20 #define VIFI_INVALID ((vifi_t) -1) #define M_HASCL(m) ((m)->m_flags & M_EXT) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); /* * Locking. We use two locks: one for the virtual interface table and * one for the forwarding table. These locks may be nested in which case * the VIF lock must always be taken first. 
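 * For example, add_mfc() below nests them in exactly that order:
 *
 *	VIF_LOCK();
 *	MFC_LOCK();
 *	... look up / modify the MFC hash chain ...
 *	MFC_UNLOCK();
 *	VIF_UNLOCK();
 *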
Note that each lock is used * to cover not only the specific data structure but also related data * structures. It may be better to add more fine-grained locking later; * it's not clear how performance-critical this code is. */ static struct mrtstat mrtstat; SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, &mrtstat, mrtstat, "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)"); static struct mfc *mfctable[MFCTBLSIZ]; SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, &mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]", "Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)"); static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) #define MFC_LOCK_ASSERT() do { \ mtx_assert(&mfc_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define MFC_LOCK_INIT() mtx_init(&mfc_mtx, "mroute mfc table", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) static struct vif viftable[MAXVIFS]; SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD, &viftable, sizeof(viftable), "S,vif[MAXVIFS]", "Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; #define VIF_LOCK() mtx_lock(&vif_mtx) #define VIF_UNLOCK() mtx_unlock(&vif_mtx) #define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) #define VIF_LOCK_INIT() mtx_init(&vif_mtx, "mroute vif table", NULL, MTX_DEF) #define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) static u_char nexpire[MFCTBLSIZ]; static struct callout expire_upcalls_ch; #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Define the token bucket filter structures * tbftable -> each vif has one of these for storing info */ static struct tbf tbftable[MAXVIFS]; #define TBF_REPROCESS (hz / 100) /* 100x / second */ /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). These interfaces are never linked into * the system ifnet list & no routes point to them. I.e., packets * can't be sent this way. They only exist as a placeholder for * multicast source verification. */ static struct ifnet multicast_decap_if[MAXVIFS]; #define ENCAP_TTL 64 #define ENCAP_PROTO IPPROTO_IPIP /* 4 */ /* prototype IP hdr for encapsulated packets */ static struct ip multicast_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ ENCAP_TTL, ENCAP_PROTO, 0, /* checksum */ }; /* * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* * Pending timeouts are stored in a hash table, the key being the * expiration time. Periodically, the entries are analysed and processed. 
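 *
 * As an illustration of the keying (see BW_METER_TIMEHASH() further
 * below): a "<=" meter with bm_start_time = 100.300000 and a threshold
 * b_time of 5 seconds expires at 105.300000; the non-zero microseconds
 * round the key up to 106, so the entry lands in bucket
 * 106 % BW_METER_BUCKETS = 106 and is not looked at again before
 * second 106.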
*/ #define BW_METER_BUCKETS 1024 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS]; static struct callout bw_meter_ch; #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX]; static u_int bw_upcalls_n; /* # of pending upcalls */ static struct callout bw_upcalls_ch; #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ #ifdef PIM static struct pimstat pimstat; SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD, &pimstat, pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); /* * Note: the PIM Register encapsulation adds the following in front of a * data packet: * * struct pim_encap_hdr { * struct ip ip; * struct pim_encap_pimhdr pim; * } * */ struct pim_encap_pimhdr { struct pim pim; uint32_t flags; }; static struct ip pim_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ ENCAP_TTL, IPPROTO_PIM, 0, /* checksum */ }; static struct pim_encap_pimhdr pim_encap_pimhdr = { { PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 0, /* reserved */ 0, /* checksum */ }, 0 /* flags */ }; static struct ifnet multicast_register_if; static vifi_t reg_vif_num = VIFI_INVALID; #endif /* PIM */ /* * Private variables. */ static vifi_t numvifs; static const struct encaptab *encap_cookie; /* * one-back cache used by mroute_encapcheck to locate a tunnel's vif * given a datagram's src ip address. */ static u_long last_encap_src; static struct vif *last_encap_vif; /* * Callout for queue processing. 
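 * (tbf_reprocess_ch drives tbf_reprocess_q(), which keeps rescheduling
 * itself every TBF_REPROCESS ticks while a vif's token bucket queue is
 * non-empty.)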
*/ static struct callout tbf_reprocess_ch; static u_long X_ip_mcast_src(int vifi); static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); static int X_legal_vif_num(int vif); static int X_mrt_ioctl(int cmd, caddr_t data); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static int ip_mrouter_init(struct socket *, int); static int add_vif(struct vifctl *); static int del_vif(vifi_t); static int add_mfc(struct mfcctl2 *); static int del_mfc(struct mfcctl2 *); static int set_api_config(uint32_t *); /* chose API capabilities */ static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static int set_assert(int); static void expire_upcalls(void *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static void encap_send(struct ip *, struct vif *, struct mbuf *); static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long); static void tbf_queue(struct vif *, struct mbuf *); static void tbf_process_q(struct vif *); static void tbf_reprocess_q(void *); static int tbf_dq_sel(struct vif *, struct ip *); static void tbf_send_packet(struct vif *, struct mbuf *); static void tbf_update_tokens(struct vif *); static int priority(struct vif *, struct ip *); /* * Bandwidth monitoring */ static void free_bw_list(struct bw_meter *list); static int add_bw_upcall(struct bw_upcall *); static int del_bw_upcall(struct bw_upcall *); static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp); static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp); static void bw_upcalls_send(void); static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp); static void unschedule_bw_meter(struct bw_meter *x); static void bw_meter_process(void); static void expire_bw_upcalls_send(void *); static void expire_bw_meter_process(void *); #ifdef PIM static int pim_register_send(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_rp(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_upcall(struct ip *, struct vif *, struct mbuf *, struct mfc *); static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *); #endif /* * whether or not special PIM assert processing is enabled. */ static int pim_assert; /* * Rate limit for assert notification messages, in usec */ #define ASSERT_MSG_TIME 3000000 /* * Kernel multicast routing API capabilities and setup. * If more API capabilities are added to the kernel, they should be * recorded in `mrt_api_support'. */ static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); static uint32_t mrt_api_config = 0; /* * Hash function for a source, group entry */ #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ ((g) >> 20) ^ ((g) >> 10) ^ (g)) /* * Find a route for a given origin IP address and Multicast group address * Type of service parameter to be added in the future!!! 
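 *
 * A typical caller (see X_ip_mforward() below) holds MFC_LOCK() and
 * does, roughly:
 *
 *	++mrtstat.mrts_mfc_lookups;
 *	rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr);
 *	if (rt == NULL)
 *		++mrtstat.mrts_mfc_misses;
 *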
* Statistics are updated by the caller if needed * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses) */ static struct mfc * mfc_find(in_addr_t o, in_addr_t g) { struct mfc *rt; MFC_LOCK_ASSERT(); for (rt = mfctable[MFCHASH(o,g)]; rt; rt = rt->mfc_next) if ((rt->mfc_origin.s_addr == o) && (rt->mfc_mcastgrp.s_addr == g) && (rt->mfc_stall == NULL)) break; return rt; } /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code */ #define TV_DELTA(a, b, delta) { \ int xxs; \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* FALLTHROUGH */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl2 mfc; struct bw_upcall bw_upcall; uint32_t i; if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: /* * select data size depending on API version. */ if (sopt->sopt_name == MRT_ADD_MFC && mrt_api_config & MRT_API_FLAGS_ALL) { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), sizeof(struct mfcctl2)); } else { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), sizeof(struct mfcctl)); bzero((caddr_t)&mfc + sizeof(struct mfcctl), sizeof(mfc) - sizeof(struct mfcctl)); } if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; case MRT_API_CONFIG: error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (!error) error = set_api_config(&i); if (!error) error = sooptcopyout(sopt, &i, sizeof i); break; case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, sizeof bw_upcall); if (error) break; if (sopt->sopt_name == MRT_ADD_BW_UPCALL) error = add_bw_upcall(&bw_upcall); else error = del_bw_upcall(&bw_upcall); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) { int error; static int version = 0x0305; /* !!! why is this here? 
XXX */ switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &version, sizeof version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); break; case MRT_API_SUPPORT: error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); break; case MRT_API_CONFIG: error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(int cmd, caddr_t data) { int error = 0; switch (cmd) { case (SIOCGETVIFCNT): error = get_vif_cnt((struct sioc_vif_req *)data); break; case (SIOCGETSGCNT): error = get_sg_cnt((struct sioc_sg_req *)data); break; default: error = EINVAL; break; } return error; } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req *req) { struct mfc *rt; MFC_LOCK(); rt = mfc_find(req->src.s_addr, req->grp.s_addr); if (rt == NULL) { MFC_UNLOCK(); req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return EADDRNOTAVAIL; } req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; MFC_UNLOCK(); return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(struct sioc_vif_req *req) { vifi_t vifi = req->vifi; VIF_LOCK(); if (vifi >= numvifs) { VIF_UNLOCK(); return EINVAL; } req->icount = viftable[vifi].v_pkt_in; req->ocount = viftable[vifi].v_pkt_out; req->ibytes = viftable[vifi].v_bytes_in; req->obytes = viftable[vifi].v_bytes_out; VIF_UNLOCK(); return 0; } static void ip_mrouter_reset(void) { bzero((caddr_t)mfctable, sizeof(mfctable)); bzero((caddr_t)nexpire, sizeof(nexpire)); pim_assert = 0; mrt_api_config = 0; callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE); bw_upcalls_n = 0; bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers)); callout_init(&bw_upcalls_ch, CALLOUT_MPSAFE); callout_init(&bw_meter_ch, CALLOUT_MPSAFE); callout_init(&tbf_reprocess_ch, CALLOUT_MPSAFE); } static struct mtx mrouter_mtx; /* used to synch init/done work */ /* * Enable multicast routing */ static int ip_mrouter_init(struct socket *so, int version) { if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; mtx_lock(&mrouter_mtx); if (ip_mrouter != NULL) { mtx_unlock(&mrouter_mtx); return EADDRINUSE; } callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, NULL); callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); ip_mrouter = so; mtx_unlock(&mrouter_mtx); if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_init\n"); return 0; } /* * Disable multicast routing */ static int X_ip_mrouter_done(void) { vifi_t vifi; int i; struct ifnet *ifp; struct ifreq ifr; struct mfc *rt; struct rtdetq *rte; mtx_lock(&mrouter_mtx); if (ip_mrouter == NULL) { mtx_unlock(&mrouter_mtx); return EINVAL; } /* * Detach/disable hooks to the reset of the system. */ ip_mrouter = NULL; mrt_api_config = 0; VIF_LOCK(); if (encap_cookie) { const struct encaptab *c = encap_cookie; encap_cookie = NULL; encap_detach(c); } VIF_UNLOCK(); callout_stop(&tbf_reprocess_ch); VIF_LOCK(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts. 
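 * (This undoes the if_allmulti(ifp, 1) that add_vif() performed for
 * each plain physical vif; tunnel and register vifs never enabled
 * allmulti and are skipped below.)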
*/ for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_lcl_addr.s_addr != 0 && !(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr); so->sin_len = sizeof(struct sockaddr_in); so->sin_family = AF_INET; so->sin_addr.s_addr = INADDR_ANY; ifp = viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)tbftable, sizeof(tbftable)); bzero((caddr_t)viftable, sizeof(viftable)); numvifs = 0; pim_assert = 0; VIF_UNLOCK(); /* * Free all multicast forwarding cache entries. */ callout_stop(&expire_upcalls_ch); callout_stop(&bw_upcalls_ch); callout_stop(&bw_meter_ch); MFC_LOCK(); for (i = 0; i < MFCTBLSIZ; i++) { for (rt = mfctable[i]; rt != NULL; ) { struct mfc *nr = rt->mfc_next; for (rte = rt->mfc_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } free_bw_list(rt->mfc_bw_meter); free(rt, M_MRTABLE); rt = nr; } } bzero((caddr_t)mfctable, sizeof(mfctable)); bzero((caddr_t)nexpire, sizeof(nexpire)); bw_upcalls_n = 0; bzero(bw_meter_timers, sizeof(bw_meter_timers)); MFC_UNLOCK(); /* * Reset de-encapsulation cache */ last_encap_src = INADDR_ANY; last_encap_vif = NULL; #ifdef PIM reg_vif_num = VIFI_INVALID; #endif mtx_unlock(&mrouter_mtx); if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_done\n"); return 0; } /* * Set PIM assert processing global */ static int set_assert(int i) { if ((i != 1) && (i != 0)) return EINVAL; pim_assert = i; return 0; } /* * Configure API capabilities */ int set_api_config(uint32_t *apival) { int i; /* * We can set the API capabilities only if it is the first operation * after MRT_INIT. I.e.: * - there are no vifs installed * - pim_assert is not enabled * - the MFC table is empty */ if (numvifs > 0) { *apival = 0; return EPERM; } if (pim_assert) { *apival = 0; return EPERM; } for (i = 0; i < MFCTBLSIZ; i++) { if (mfctable[i] != NULL) { *apival = 0; return EPERM; } } mrt_api_config = *apival & mrt_api_support; *apival = mrt_api_config; return 0; } /* * Decide if a packet is from a tunnelled peer. * Return 0 if not, 64 if so. XXX yuck.. 64 ??? */ static int mroute_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip *ip = mtod(m, struct ip *); int hlen = ip->ip_hl << 2; /* * don't claim the packet if it's not to a multicast destination or if * we don't have an encapsulating tunnel with the source. * Note: This code assumes that the remote site IP address * uniquely identifies the tunnel (i.e., that this site has * at most one tunnel with the remote site). */ if (!IN_MULTICAST(ntohl(((struct ip *)((char *)ip+hlen))->ip_dst.s_addr))) return 0; if (ip->ip_src.s_addr != last_encap_src) { struct vif *vifp = viftable; struct vif *vife = vifp + numvifs; last_encap_src = ip->ip_src.s_addr; last_encap_vif = NULL; for ( ; vifp < vife; ++vifp) if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) { if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) == VIFF_TUNNEL) last_encap_vif = vifp; break; } } if (last_encap_vif == NULL) { last_encap_src = INADDR_ANY; return 0; } return 64; } /* * De-encapsulate a packet and feed it back through ip input (this * routine is called whenever IP gets a packet that mroute_encap_func() * claimed). 
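 * That is, the outer IP header that encap_send() prepended is stripped
 * (together with any IP options), m_pkthdr.rcvif is pointed at the
 * pseudo decapsulation interface of the matching tunnel vif, and the
 * inner multicast packet is handed back to the stack with
 * netisr_queue(NETISR_IP, m).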
*/ static void mroute_encap_input(struct mbuf *m, int off) { struct ip *ip = mtod(m, struct ip *); int hlen = ip->ip_hl << 2; if (hlen > sizeof(struct ip)) ip_stripoptions(m, (struct mbuf *) 0); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); m->m_pkthdr.len -= sizeof(struct ip); m->m_pkthdr.rcvif = last_encap_vif->v_ifp; netisr_queue(NETISR_IP, m); /* * normally we would need a "schednetisr(NETISR_IP)" * here but we were called by ip_input and it is going * to loop back & try to dequeue the packet we just * queued as soon as we return so we avoid the * unnecessary software interrrupt. * * XXX * This no longer holds - we may have direct-dispatched the packet, * or there may be a queue processing limit. */ } extern struct domain inetdomain; static struct protosw mroute_encap_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR, mroute_encap_input, 0, 0, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; /* * Add a vif to the vif table */ static int add_vif(struct vifctl *vifcp) { struct vif *vifp = viftable + vifcp->vifc_vifi; struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error; struct tbf *v_tbf = tbftable + vifcp->vifc_vifi; VIF_LOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { VIF_UNLOCK(); return EINVAL; } if (vifp->v_lcl_addr.s_addr != INADDR_ANY) { VIF_UNLOCK(); return EADDRINUSE; } if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY) { VIF_UNLOCK(); return EADDRNOTAVAIL; } /* Find the interface with an address in AF_INET family */ #ifdef PIM if (vifcp->vifc_flags & VIFF_REGISTER) { /* * XXX: Because VIFF_REGISTER does not really need a valid * local interface (e.g. it could be 127.0.0.2), we don't * check its address. */ ifp = NULL; } else #endif { sin.sin_addr = vifcp->vifc_lcl_addr; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; } if (vifcp->vifc_flags & VIFF_TUNNEL) { if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { /* * An encapsulating tunnel is wanted. Tell * mroute_encap_input() to start paying attention * to encapsulated packets. 
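 * (The encap_attach_func() call below registers mroute_encapcheck(),
 * which claims IPPROTO_IPV4 packets whose outer source address matches
 * a VIFF_TUNNEL vif; the hook is attached only once, when the first
 * tunnel vif is added.)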
*/ if (encap_cookie == NULL) { int i; encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4, mroute_encapcheck, (struct protosw *)&mroute_encap_protosw, NULL); if (encap_cookie == NULL) { printf("ip_mroute: unable to attach encap\n"); VIF_UNLOCK(); return EIO; /* XXX */ } for (i = 0; i < MAXVIFS; ++i) { if_initname(&multicast_decap_if[i], "mdecap", i); } } /* * Set interface to fake encapsulator interface */ ifp = &multicast_decap_if[vifcp->vifc_vifi]; /* * Prepare cached route entry */ bzero(&vifp->v_route, sizeof(vifp->v_route)); } else { log(LOG_ERR, "source routed tunnels not supported\n"); VIF_UNLOCK(); return EOPNOTSUPP; } #ifdef PIM } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &multicast_register_if; if (mrtdebug) log(LOG_DEBUG, "Adding a register vif, ifp: %p\n", (void *)&multicast_register_if); if (reg_vif_num == VIFI_INVALID) { if_initname(&multicast_register_if, "register_vif", 0); multicast_register_if.if_flags = IFF_LOOPBACK; bzero(&vifp->v_route, sizeof(vifp->v_route)); reg_vif_num = vifcp->vifc_vifi; } #endif } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { VIF_UNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { VIF_UNLOCK(); return error; } } /* define parameters for the tbf structure */ vifp->v_tbf = v_tbf; GET_TIME(vifp->v_tbf->tbf_last_pkt_t); vifp->v_tbf->tbf_n_tok = 0; vifp->v_tbf->tbf_q_len = 0; vifp->v_tbf->tbf_max_q_len = MAXQSIZE; vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* scaling up here allows division by 1024 in critical code */ vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000; vifp->v_rsvp_on = 0; vifp->v_rsvpd = NULL; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; /* Adjust numvifs up if the vifi is higher than numvifs */ if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; VIF_UNLOCK(); if (mrtdebug) log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n", vifcp->vifc_vifi, (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), vifcp->vifc_threshold, vifcp->vifc_rate_limit); return 0; } /* * Delete a vif from the vif table */ static int del_vif(vifi_t vifi) { struct vif *vifp; VIF_LOCK(); if (vifi >= numvifs) { VIF_UNLOCK(); return EINVAL; } vifp = &viftable[vifi]; if (vifp->v_lcl_addr.s_addr == INADDR_ANY) { VIF_UNLOCK(); return EADDRNOTAVAIL; } if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) if_allmulti(vifp->v_ifp, 0); if (vifp == last_encap_vif) { last_encap_vif = NULL; last_encap_src = INADDR_ANY; } /* * Free packets queued at the interface */ while (vifp->v_tbf->tbf_q) { struct mbuf *m = vifp->v_tbf->tbf_q; vifp->v_tbf->tbf_q = m->m_act; m_freem(m); } #ifdef PIM if (vifp->v_flags & VIFF_REGISTER) reg_vif_num = VIFI_INVALID; #endif bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf))); bzero((caddr_t)vifp, sizeof (*vifp)); if (mrtdebug) log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs); /* Adjust numvifs down */ for (vifi = numvifs; vifi > 0; vifi--) if (viftable[vifi-1].v_lcl_addr.s_addr != INADDR_ANY) break; numvifs = vifi; VIF_UNLOCK(); return 0; } /* * update an mfc entry without resetting counters and S,G addresses. */ static void update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { int i; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) { rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config & MRT_MFC_FLAGS_ALL; } /* set the RP address */ if (mrt_api_config & MRT_MFC_RP) rt->mfc_rp = mfccp->mfcc_rp; else rt->mfc_rp.s_addr = INADDR_ANY; } /* * fully initialize an mfc entry from the parameter. */ static void init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; update_mfc_params(rt, mfccp); /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; } /* * Add an mfc entry */ static int add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; u_long hash; struct rtdetq *rte; u_short nstl; VIF_LOCK(); MFC_LOCK(); rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); /* If an entry already exists, just update the fields */ if (rt) { if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); update_mfc_params(rt, mfccp); MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Find the entry for which the upcall was made and update */ hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) { if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && (rt->mfc_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n", "multiple kernel entries", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, (void *)rt->mfc_stall); if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, (void *)rt->mfc_stall); init_mfc_params(rt, mfccp); rt->mfc_expire = 0; /* Don't clean this guy up */ nexpire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mfc_stall; rte != NULL; ) { struct rtdetq *n = rte->next; ip_mdq(rte->m, 
rte->ifp, rt, -1); m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } rt->mfc_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n", hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) { if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) { init_mfc_params(rt, mfccp); if (rt->mfc_expire) nexpire[hash]--; rt->mfc_expire = 0; break; /* XXX */ } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } init_mfc_params(rt, mfccp); rt->mfc_expire = 0; rt->mfc_stall = NULL; rt->mfc_bw_meter = NULL; /* insert new entry at head of hash chain */ rt->mfc_next = mfctable[hash]; mfctable[hash] = rt; } } MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Delete an mfc entry */ static int del_mfc(struct mfcctl2 *mfccp) { struct in_addr origin; struct in_addr mcastgrp; struct mfc *rt; struct mfc **nptr; u_long hash; struct bw_meter *list; origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n", (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); MFC_LOCK(); hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next) if (origin.s_addr == rt->mfc_origin.s_addr && mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && rt->mfc_stall == NULL) break; if (rt == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } *nptr = rt->mfc_next; /* * free the bw_meter entries */ list = rt->mfc_bw_meter; rt->mfc_bw_meter = NULL; free(rt, M_MRTABLE); free_bw_list(list); MFC_UNLOCK(); return 0; } /* * Send a message to mrouted on the multicast routing socket */ static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) { if (s) { - if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { - sorwakeup(s); + SOCKBUF_LOCK(&s->so_rcv); + if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, + NULL) != 0) { + sorwakeup_locked(s); return 0; } + SOCKBUF_UNLOCK(&s->so_rcv); } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { struct mfc *rt; int error; vifi_t vifi; if (mrtdebug & DEBUG_FORWARD) log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n", (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), (void *)ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel or a register_vif. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. 
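 * (add_vif() refuses to create VIFF_SRCRT tunnels in the first place,
 * so such packets are simply dropped here, with the log message
 * rate-limited to once per second.)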
*/ static int last_log; if (last_log != time_second) { last_log = time_second; log(LOG_ERR, "ip_mforward: received source-routed packet from %lx\n", (u_long)ntohl(ip->ip_src.s_addr)); } return 1; } VIF_LOCK(); MFC_LOCK(); if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) { if (ip->ip_ttl < 255) ip->ip_ttl++; /* compensate for -1 in *_send routines */ if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { struct vif *vifp = viftable + vifi; printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s)\n", (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr), vifi, (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "", vifp->v_ifp->if_xname); } error = ip_mdq(m, ifp, NULL, vifi); MFC_UNLOCK(); VIF_UNLOCK(); return error; } if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n", (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr)); if (!imo) printf("In fact, no options were specified at all\n"); } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. */ if (ip->ip_ttl <= 1 || ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) { MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Determine forwarding vifs from the forwarding cache table */ ++mrtstat.mrts_mfc_lookups; rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr); /* Entry exists, so forward if necessary */ if (rt != NULL) { error = ip_mdq(m, ifp, rt, -1); MFC_UNLOCK(); VIF_UNLOCK(); return error; } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; int hlen = ip->ip_hl << 2; ++mrtstat.mrts_mfc_misses; mrtstat.mrts_no_route++; if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n", (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT); if (rte == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } mb0 = m_copypacket(m, M_DONTWAIT); if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* is there an upcall waiting for this flow ? */ hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr); for (rt = mfctable[hash]; rt; rt = rt->mfc_next) { if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) && (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) && (rt->mfc_stall != NULL)) break; } if (rt == NULL) { int i; struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct mbuf *mm; /* * Locate the vifi for the incoming interface for this packet. * If none found, drop packet. 
*/ for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= numvifs) /* vif not found, drop packet */ goto non_fatal; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) goto fail; /* Make a copy of the header to send to the user level process */ mm = m_copy(mb0, 0, hlen); if (mm == NULL) goto fail1; /* * Send message to routing daemon to install * a route into the kernel table */ im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; im->im_vif = vifi; mrtstat.mrts_upcalls++; k_igmpsrc.sin_addr = ip->ip_src; if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; fail1: free(rt, M_MRTABLE); fail: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; nexpire[hash]++; for (i = 0; i < numvifs; i++) { rt->mfc_ttls[i] = 0; rt->mfc_flags[i] = 0; } rt->mfc_parent = -1; rt->mfc_rp.s_addr = INADDR_ANY; /* clear the RP address */ rt->mfc_bw_meter = NULL; /* link into table */ rt->mfc_next = mfctable[hash]; mfctable[hash] = rt; rt->mfc_stall = rte; } else { /* determine if q has overflowed */ int npkts = 0; struct rtdetq **p; /* * XXX ouch! we need to append to the list, but we * only have a pointer to the front, so we have to * scan the entire list every time. */ for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next) npkts++; if (npkts > MAX_UPQ) { mrtstat.mrts_upq_ovflw++; non_fatal: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* Add this entry to the end of the queue */ *p = rte; } rte->m = mb0; rte->ifp = ifp; rte->next = NULL; MFC_UNLOCK(); VIF_UNLOCK(); return 0; } } /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *unused) { struct rtdetq *rte; struct mfc *mfc, **nptr; int i; MFC_LOCK(); for (i = 0; i < MFCTBLSIZ; i++) { if (nexpire[i] == 0) continue; nptr = &mfctable[i]; for (mfc = *nptr; mfc != NULL; mfc = *nptr) { /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 && --mfc->mfc_expire == 0) { if (mrtdebug & DEBUG_EXPIRE) log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n", (u_long)ntohl(mfc->mfc_origin.s_addr), (u_long)ntohl(mfc->mfc_mcastgrp.s_addr)); /* * drop all the packets * free the mbuf with the pkt, if, timing info */ for (rte = mfc->mfc_stall; rte; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } ++mrtstat.mrts_cache_cleanups; nexpire[i]--; /* * free the bw_meter entries */ while (mfc->mfc_bw_meter != NULL) { struct bw_meter *x = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x->bm_mfc_next; free(x, M_BWMETER); } *nptr = mfc->mfc_next; free(mfc, M_MRTABLE); } else { nptr = &mfc->mfc_next; } } } MFC_UNLOCK(); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ip->ip_len; VIF_LOCK_ASSERT(); /* * Macro to send packet on vif. Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. 
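 * MC_SEND() dispatches the packet to encap_send() for tunnel vifs and
 * to phyint_send() for physical interfaces; each of those makes its own
 * copy and hands it to the vif's token bucket filter (tbf_control() or
 * tbf_send_packet()).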
*/ #define MC_SEND(ip,vifp,m) { \ if ((vifp)->v_flags & VIFF_TUNNEL) \ encap_send((ip), (vifp), (m)); \ else \ phyint_send((ip), (vifp), (m)); \ } /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < numvifs) { #ifdef PIM if (viftable[xmt_vif].v_flags & VIFF_REGISTER) pim_register_send(ip, viftable + xmt_vif, m, rt); else #endif MC_SEND(ip, viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. */ vifi = rt->mfc_parent; if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { /* came in the wrong interface */ if (mrtdebug & DEBUG_FORWARD) log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", (void *)ifp, vifi, (void *)viftable[vifi].v_ifp); ++mrtstat.mrts_wrong_if; ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, send a message * to the routing daemon. * * XXX: A PIM-SM router needs the WRONGVIF detection so it * can complete the SPT switch, regardless of the type * of the iif (broadcast media, GRE tunnel, etc). */ if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) { struct timeval now; u_long delta; #ifdef PIM if (ifp == &multicast_register_if) pimstat.pims_rcv_registers_wrongiif++; #endif /* Get vifi for the incoming packet */ for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= numvifs) return 0; /* The iif is not found: ignore the packet. */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) return 0; /* WRONGVIF disabled: ignore the packet */ GET_TIME(now); TV_DELTA(rt->mfc_last_assert, now, delta); if (delta > ASSERT_MSG_TIME) { struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct mbuf *mm = m_copy(m, 0, hlen); if (mm && (M_HASCL(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; rt->mfc_last_assert = now; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; mrtstat.mrts_upcalls++; k_igmpsrc.sin_addr = im->im_src; if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. */ if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { viftable[vifi].v_pkt_out++; viftable[vifi].v_bytes_out += plen; } else { viftable[vifi].v_pkt_in++; viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifi = 0; vifi < numvifs; vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { viftable[vifi].v_pkt_out++; viftable[vifi].v_bytes_out += plen; #ifdef PIM if (viftable[vifi].v_flags & VIFF_REGISTER) pim_register_send(ip, viftable + vifi, m, rt); else #endif MC_SEND(ip, viftable+vifi, m); } /* * Perform upcall-related bw measuring. */ if (rt->mfc_bw_meter != NULL) { struct bw_meter *x; struct timeval now; GET_TIME(now); MFC_LOCK_ASSERT(); for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) bw_meter_receive_packet(x, plen, &now); } return 0; } /* * check if a vif number is legal/ok. This is used by ip_output. */ static int X_legal_vif_num(int vif) { /* XXX unlocked, matter? 
*/ return (vif >= 0 && vif < numvifs); } /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(int vifi) { /* XXX unlocked, matter? */ if (vifi >= 0 && vifi < numvifs) return viftable[vifi].v_lcl_addr.s_addr; else return INADDR_ANY; } static void phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; VIF_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. */ mb_copy = m_copypacket(m, M_DONTWAIT); if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; if (vifp->v_rate_limit == 0) tbf_send_packet(vifp, mb_copy); else tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len); } static void encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; struct ip *ip_copy; int i, len = ip->ip_len; VIF_LOCK_ASSERT(); /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * copy the old packet & pullup its IP header into the * new mbuf so we can modify it. Try to fill the new * mbuf since if we don't the ethernet driver will. */ MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER); if (mb_copy == NULL) return; #ifdef MAC mac_create_mbuf_multicast_encap(m, vifp->v_ifp, mb_copy); #endif mb_copy->m_data += max_linkhdr; mb_copy->m_len = sizeof(multicast_encap_iphdr); if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) { m_freem(mb_copy); return; } i = MHLEN - M_LEADINGSPACE(mb_copy); if (i > len) i = len; mb_copy = m_pullup(mb_copy, i); if (mb_copy == NULL) return; mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr); /* * fill in the encapsulating IP header. */ ip_copy = mtod(mb_copy, struct ip *); *ip_copy = multicast_encap_iphdr; #ifdef RANDOM_IP_ID ip_copy->ip_id = ip_randomid(); #else ip_copy->ip_id = htons(ip_id++); #endif ip_copy->ip_len += len; ip_copy->ip_src = vifp->v_lcl_addr; ip_copy->ip_dst = vifp->v_rmt_addr; /* * turn the encapsulated IP header back into a valid one. */ ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); --ip->ip_ttl; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; mb_copy->m_data += sizeof(multicast_encap_iphdr); ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); mb_copy->m_data -= sizeof(multicast_encap_iphdr); if (vifp->v_rate_limit == 0) tbf_send_packet(vifp, mb_copy); else tbf_control(vifp, mb_copy, ip, ip_copy->ip_len); } /* * Token bucket filter module */ static void tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_long p_len) { struct tbf *t = vifp->v_tbf; VIF_LOCK_ASSERT(); if (p_len > MAX_BKT_SIZE) { /* drop if packet is too large */ mrtstat.mrts_pkt2large++; m_freem(m); return; } tbf_update_tokens(vifp); if (t->tbf_q_len == 0) { /* queue empty... 
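	 * the packet goes out at once if the bucket holds enough
	 * tokens.  As a quick sanity check on the token arithmetic
	 * (tbf_update_tokens() was just called above, and is defined
	 * further below): a vif configured with vifc_rate_limit =
	 * 1000 kbit/s gets v_rate_limit = 1000 * 1024 / 1000 = 1024,
	 * so a 100000 usec gap adds 100000 * 1024 / 1024 / 8 = 12500
	 * bytes of tokens -- exactly 1 Mbit/s.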
*/ if (p_len <= t->tbf_n_tok) { /* send packet if enough tokens */ t->tbf_n_tok -= p_len; tbf_send_packet(vifp, m); } else { /* no, queue packet and try later */ tbf_queue(vifp, m); callout_reset(&tbf_reprocess_ch, TBF_REPROCESS, tbf_reprocess_q, vifp); } } else if (t->tbf_q_len < t->tbf_max_q_len) { /* finite queue length, so queue pkts and process queue */ tbf_queue(vifp, m); tbf_process_q(vifp); } else { /* queue full, try to dq and queue and process */ if (!tbf_dq_sel(vifp, ip)) { mrtstat.mrts_q_overflow++; m_freem(m); } else { tbf_queue(vifp, m); tbf_process_q(vifp); } } } /* * adds a packet to the queue at the interface */ static void tbf_queue(struct vif *vifp, struct mbuf *m) { struct tbf *t = vifp->v_tbf; VIF_LOCK_ASSERT(); if (t->tbf_t == NULL) /* Queue was empty */ t->tbf_q = m; else /* Insert at tail */ t->tbf_t->m_act = m; t->tbf_t = m; /* Set new tail pointer */ #ifdef DIAGNOSTIC /* Make sure we didn't get fed a bogus mbuf */ if (m->m_act) panic("tbf_queue: m_act"); #endif m->m_act = NULL; t->tbf_q_len++; } /* * processes the queue at the interface */ static void tbf_process_q(struct vif *vifp) { struct tbf *t = vifp->v_tbf; VIF_LOCK_ASSERT(); /* loop through the queue at the interface and send as many packets * as possible */ while (t->tbf_q_len > 0) { struct mbuf *m = t->tbf_q; int len = mtod(m, struct ip *)->ip_len; /* determine if the packet can be sent */ if (len > t->tbf_n_tok) /* not enough tokens, we are done */ break; /* ok, reduce no of tokens, dequeue and send the packet. */ t->tbf_n_tok -= len; t->tbf_q = m->m_act; if (--t->tbf_q_len == 0) t->tbf_t = NULL; m->m_act = NULL; tbf_send_packet(vifp, m); } } static void tbf_reprocess_q(void *xvifp) { struct vif *vifp = xvifp; if (ip_mrouter == NULL) return; VIF_LOCK(); tbf_update_tokens(vifp); tbf_process_q(vifp); if (vifp->v_tbf->tbf_q_len) callout_reset(&tbf_reprocess_ch, TBF_REPROCESS, tbf_reprocess_q, vifp); VIF_UNLOCK(); } /* function that will selectively discard a member of the queue * based on the precedence value and the priority */ static int tbf_dq_sel(struct vif *vifp, struct ip *ip) { u_int p; struct mbuf *m, *last; struct mbuf **np; struct tbf *t = vifp->v_tbf; VIF_LOCK_ASSERT(); p = priority(vifp, ip); np = &t->tbf_q; last = NULL; while ((m = *np) != NULL) { if (p > priority(vifp, mtod(m, struct ip *))) { *np = m->m_act; /* If we're removing the last packet, fix the tail pointer */ if (m == t->tbf_t) t->tbf_t = last; m_freem(m); /* It's impossible for the queue to be empty, but check anyways. */ if (--t->tbf_q_len == 0) t->tbf_t = NULL; mrtstat.mrts_drop_sel++; return 1; } np = &m->m_act; last = m; } return 0; } static void tbf_send_packet(struct vif *vifp, struct mbuf *m) { VIF_LOCK_ASSERT(); if (vifp->v_flags & VIFF_TUNNEL) /* If tunnel options */ ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL); else { struct ip_moptions imo; int error; static struct route ro; /* XXX check this */ imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. 
*/ error = ip_output(m, NULL, &ro, IP_FORWARDING, &imo, NULL); if (mrtdebug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on vif %d err %d\n", (int)(vifp - viftable), error); } } /* determine the current time and then * the elapsed time (between the last time and time now) * in milliseconds & update the no. of tokens in the bucket */ static void tbf_update_tokens(struct vif *vifp) { struct timeval tp; u_long tm; struct tbf *t = vifp->v_tbf; VIF_LOCK_ASSERT(); GET_TIME(tp); TV_DELTA(tp, t->tbf_last_pkt_t, tm); /* * This formula is actually * "time in seconds" * "bytes/second". * * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) * * The (1000/1024) was introduced in add_vif to optimize * this divide into a shift. */ t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8; t->tbf_last_pkt_t = tp; if (t->tbf_n_tok > MAX_BKT_SIZE) t->tbf_n_tok = MAX_BKT_SIZE; } static int priority(struct vif *vifp, struct ip *ip) { int prio = 50; /* the lowest priority -- default case */ /* temporary hack; may add general packet classifier some day */ /* * The UDP port space is divided up into four priority ranges: * [0, 16384) : unclassified - lowest priority * [16384, 32768) : audio - highest priority * [32768, 49152) : whiteboard - medium priority * [49152, 65536) : video - low priority * * Everything else gets lowest priority. */ if (ip->ip_p == IPPROTO_UDP) { struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); switch (ntohs(udp->uh_dport) & 0xc000) { case 0x4000: prio = 70; break; case 0x8000: prio = 60; break; case 0xc000: prio = 55; break; } } return prio; } /* * End of token bucket filter modifications */ static int X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt) { int error, vifi; if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) return error; VIF_LOCK(); if (vifi < 0 || vifi >= numvifs) { /* Error if vif is invalid */ VIF_UNLOCK(); return EADDRNOTAVAIL; } if (sopt->sopt_name == IP_RSVP_VIF_ON) { /* Check if socket is available. */ if (viftable[vifi].v_rsvpd != NULL) { VIF_UNLOCK(); return EADDRINUSE; } viftable[vifi].v_rsvpd = so; /* This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 1; rsvp_on++; } } else { /* must be VIF_OFF */ /* * XXX as an additional consistency check, one could make sure * that viftable[vifi].v_rsvpd == so, otherwise passing so as * first parameter is pretty useless. */ viftable[vifi].v_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; rsvp_on--; } } VIF_UNLOCK(); return 0; } static void X_ip_rsvp_force_done(struct socket *so) { int vifi; /* Don't bother if it is not the right type of socket. */ if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return; VIF_LOCK(); /* The socket may be attached to more than one vif...this * is perfectly legal. */ for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_rsvpd == so) { viftable[vifi].v_rsvpd = NULL; /* This may seem silly, but we need to be sure we don't * over-decrement the RSVP counter, in case something slips up. 
*/ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; rsvp_on--; } } } VIF_UNLOCK(); } static void X_rsvp_input(struct mbuf *m, int off) { int vifi; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; struct ifnet *ifp; if (rsvpdebug) printf("rsvp_input: rsvp_on %d\n",rsvp_on); /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!rsvp_on) { m_freem(m); return; } if (rsvpdebug) printf("rsvp_input: check vifs\n"); #ifdef DIAGNOSTIC M_ASSERTPKTHDR(m); #endif ifp = m->m_pkthdr.rcvif; VIF_LOCK(); /* Find which vif the packet arrived on. */ for (vifi = 0; vifi < numvifs; vifi++) if (viftable[vifi].v_ifp == ifp) break; if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) { /* * Drop the lock here to avoid holding it across rip_input. * This could make rsvpdebug printfs wrong. If you care, * record the state of stuff before dropping the lock. */ VIF_UNLOCK(); /* * If the old-style non-vif-associated socket is set, * then use it. Otherwise, drop packet since there * is no specific socket for this vif. */ if (ip_rsvpd != NULL) { if (rsvpdebug) printf("rsvp_input: Sending packet up old-style socket\n"); rip_input(m, off); /* xxx */ } else { if (rsvpdebug && vifi == numvifs) printf("rsvp_input: Can't find vif for packet.\n"); else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL) printf("rsvp_input: No socket defined for vif %d\n",vifi); m_freem(m); } return; } rsvp_src.sin_addr = ip->ip_src; if (rsvpdebug && m) printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) { if (rsvpdebug) printf("rsvp_input: Failed to append to socket\n"); } else { if (rsvpdebug) printf("rsvp_input: send packet up\n"); } VIF_UNLOCK(); } /* * Code for bandwidth monitors */ /* * Define common interface for timeval-related methods */ #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) static uint32_t compute_bw_meter_flags(struct bw_upcall *req) { uint32_t flags = 0; if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) flags |= BW_METER_UNIT_PACKETS; if (req->bu_flags & BW_UPCALL_UNIT_BYTES) flags |= BW_METER_UNIT_BYTES; if (req->bu_flags & BW_UPCALL_GEQ) flags |= BW_METER_GEQ; if (req->bu_flags & BW_UPCALL_LEQ) flags |= BW_METER_LEQ; return flags; } /* * Add a bw_meter entry */ static int add_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; struct bw_meter *x; uint32_t flags; if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; /* Test if the flags are valid */ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) return EINVAL; if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) return EINVAL; if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) return EINVAL; /* Test if the threshold time interval is valid */ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) return EINVAL; flags = compute_bw_meter_flags(req); /* * Find if we have already same bw_meter entry */ MFC_LOCK(); mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } for (x = mfc->mfc_bw_meter; 
x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); if (x == NULL) { MFC_UNLOCK(); return ENOBUFS; } /* Set the new bw_meter entry */ x->bm_threshold.b_time = req->bu_threshold.b_time; GET_TIME(now); x->bm_start_time = now; x->bm_threshold.b_packets = req->bu_threshold.b_packets; x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; /* Add the new bw_meter entry to the front of entries for this MFC */ x->bm_mfc = mfc; x->bm_mfc_next = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x; schedule_bw_meter(x, &now); MFC_UNLOCK(); return 0; } static void free_bw_list(struct bw_meter *list) { while (list != NULL) { struct bw_meter *x = list; list = list->bm_mfc_next; unschedule_bw_meter(x); free(x, M_BWMETER); } } /* * Delete one or multiple bw_meter entries */ static int del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct bw_meter *x; if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; MFC_LOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { /* * Delete all bw_meter entries for this mfc */ struct bw_meter *list; list = mfc->mfc_bw_meter; mfc->mfc_bw_meter = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; uint32_t flags = 0; flags = compute_bw_meter_flags(req); /* Find the bw_meter entry to delete */ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) break; } if (x != NULL) { /* Delete entry from the list for this MFC */ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { MFC_UNLOCK(); return EINVAL; } } /* NOTREACHED */ } /* * Perform bandwidth measurement processing that may result in an upcall */ static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; MFC_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should deliver an upcall */ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || 
((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } } else if (x->bm_flags & BW_METER_LEQ) { /* * Processing for "<=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* * We are behind time with the multicast forwarding table * scanning for "<=" type of bw_meter entries, so test now * if we should deliver an upcall. */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); } /* Reschedule the bw_meter entry */ unschedule_bw_meter(x); schedule_bw_meter(x, nowp); } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should restart the measuring interval */ if ((x->bm_flags & BW_METER_UNIT_PACKETS && x->bm_measured.b_packets <= x->bm_threshold.b_packets) || (x->bm_flags & BW_METER_UNIT_BYTES && x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { /* Don't restart the measuring interval */ } else { /* Do restart the measuring interval */ /* * XXX: note that we don't unschedule and schedule, because this * might be too much overhead per packet. Instead, when we process * all entries for a given timer hash bin, we check whether it is * really a timeout. If not, we reschedule at that time. */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } } } /* * Prepare a bandwidth-related upcall */ static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) { struct timeval delta; struct bw_upcall *u; MFC_LOCK_ASSERT(); /* * Compute the measured time interval */ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); /* * If there are too many pending upcalls, deliver them now */ if (bw_upcalls_n >= BW_UPCALLS_MAX) bw_upcalls_send(); /* * Set the bw_upcall entry */ u = &bw_upcalls[bw_upcalls_n++]; u->bu_src = x->bm_mfc->mfc_origin; u->bu_dst = x->bm_mfc->mfc_mcastgrp; u->bu_threshold.b_time = x->bm_threshold.b_time; u->bu_threshold.b_packets = x->bm_threshold.b_packets; u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; u->bu_measured.b_time = delta; u->bu_measured.b_packets = x->bm_measured.b_packets; u->bu_measured.b_bytes = x->bm_measured.b_bytes; u->bu_flags = 0; if (x->bm_flags & BW_METER_UNIT_PACKETS) u->bu_flags |= BW_UPCALL_UNIT_PACKETS; if (x->bm_flags & BW_METER_UNIT_BYTES) u->bu_flags |= BW_UPCALL_UNIT_BYTES; if (x->bm_flags & BW_METER_GEQ) u->bu_flags |= BW_UPCALL_GEQ; if (x->bm_flags & BW_METER_LEQ) u->bu_flags |= BW_UPCALL_LEQ; } /* * Send the pending bandwidth-related upcalls */ static void bw_upcalls_send(void) { struct mbuf *m; int len = bw_upcalls_n * sizeof(bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static struct igmpmsg igmpmsg = { 0, /* unused1 */ 0, /* unused2 */ IGMPMSG_BW_UPCALL,/* im_msgtype */ 0, /* im_mbz */ 0, /* im_vif */ 0, /* unused3 */ { 0 }, /* im_src */ { 0 } }; /* im_dst */ MFC_LOCK_ASSERT(); if (bw_upcalls_n == 0) return; /* No pending upcalls */ bw_upcalls_n = 0; /* * Allocate a new mbuf, initialize it with the header and * the payload for the pending calls. 
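 *
 * The two m_copyback() calls below lay the message out as the fixed igmpmsg
 * header followed immediately by the array of pending bw_upcall records.
 * A rough userland sketch of that layout, using stand-in structs rather than
 * the real igmpmsg/bw_upcall definitions:
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *    #include <string.h>
 *
 *    struct hdr { uint8_t msgtype; uint8_t pad[3]; };    // stands in for igmpmsg
 *    struct rec { uint32_t src, dst, packets; };         // stands in for bw_upcall
 *
 *    int
 *    main(void)
 *    {
 *        struct hdr h = { 42, { 0 } };
 *        struct rec pending[3] = { {1, 2, 3}, {4, 5, 6}, {7, 8, 9} };
 *        unsigned char msg[sizeof(h) + sizeof(pending)];
 *
 *        memcpy(msg, &h, sizeof(h));                         // header first
 *        memcpy(msg + sizeof(h), pending, sizeof(pending));  // then the payload
 *        printf("message is %zu bytes\n", sizeof(msg));
 *        return (0);
 *    }
 *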
*/ MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); return; } m->m_len = m->m_pkthdr.len = 0; m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]); /* * Send the upcalls * XXX do we need to set the address in k_igmpsrc ? */ mrtstat.mrts_upcalls++; if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; } } /* * Compute the timeout hash value for the bw_meter entries */ #define BW_METER_TIMEHASH(bw_meter, hash) \ do { \ struct timeval next_timeval = (bw_meter)->bm_start_time; \ \ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ (hash) = next_timeval.tv_sec; \ if (next_timeval.tv_usec) \ (hash)++; /* XXX: make sure we don't timeout early */ \ (hash) %= BW_METER_BUCKETS; \ } while (0) /* * Schedule a timer to process periodically bw_meter entry of type "<=" * by linking the entry in the proper hash bucket. */ static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) { int time_hash; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; /* * Compute the timeout hash value and insert the entry */ BW_METER_TIMEHASH(x, time_hash); x->bm_time_next = bw_meter_timers[time_hash]; bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; } /* * Unschedule the periodic timer that processes bw_meter entry of type "<=" * by removing the entry from the proper hash bucket. */ static void unschedule_bw_meter(struct bw_meter *x) { int time_hash; struct bw_meter *prev, *tmp; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Compute the timeout hash value and delete the entry */ time_hash = x->bm_time_hash; if (time_hash >= BW_METER_BUCKETS) return; /* Entry was not scheduled */ for (prev = NULL, tmp = bw_meter_timers[time_hash]; tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) if (tmp == x) break; if (tmp == NULL) panic("unschedule_bw_meter: bw_meter entry not found"); if (prev != NULL) prev->bm_time_next = x->bm_time_next; else bw_meter_timers[time_hash] = x->bm_time_next; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; } /* * Process all "<=" type of bw_meter that should be processed now, * and for each entry prepare an upcall if necessary. Each processed * entry is rescheduled again for the (periodic) processing. * * This is run periodically (once per second normally). On each round, * all the potentially matching entries are in the hash slot that we are * looking at. */ static void bw_meter_process() { static uint32_t last_tv_sec; /* last time we processed this */ uint32_t loops; int i; struct timeval now, process_endtime; GET_TIME(now); if (last_tv_sec == now.tv_sec) return; /* nothing to do */ loops = now.tv_sec - last_tv_sec; last_tv_sec = now.tv_sec; if (loops > BW_METER_BUCKETS) loops = BW_METER_BUCKETS; MFC_LOCK(); /* * Process all bins of bw_meter entries from the one after the last * processed to the current one. On entry, i points to the last bucket * visited, so we need to increment i at the beginning of the loop. 
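 *
 * A standalone sketch of this bucket walk, with made-up timestamps and a
 * stand-in bucket count (not the kernel's BW_METER_BUCKETS):
 *
 *    #include <stdio.h>
 *
 *    #define BUCKETS 1024    // stand-in for BW_METER_BUCKETS
 *
 *    int
 *    main(void)
 *    {
 *        unsigned last_sec = 1000, now_sec = 1005;   // assumed clock samples
 *        unsigned loops = now_sec - last_sec;
 *        int i;
 *
 *        if (loops > BUCKETS)
 *            loops = BUCKETS;            // never walk the ring more than once
 *        // start at the last visited bin and step forward before using it
 *        for (i = (now_sec - loops) % BUCKETS; loops > 0; loops--) {
 *            if (++i >= BUCKETS)
 *                i = 0;
 *            printf("process bin %d\n", i);
 *        }
 *        return (0);
 *    }
 *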
*/ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { struct bw_meter *x, *tmp_list; if (++i >= BW_METER_BUCKETS) i = 0; /* Disconnect the list of bw_meter entries from the bin */ tmp_list = bw_meter_timers[i]; bw_meter_timers[i] = NULL; /* Process the list of bw_meter entries */ while (tmp_list != NULL) { x = tmp_list; tmp_list = tmp_list->bm_time_next; /* Test if the time interval is over */ process_endtime = x->bm_start_time; BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); if (BW_TIMEVALCMP(&process_endtime, &now, >)) { /* Not yet: reschedule, but don't reset */ int time_hash; BW_METER_TIMEHASH(x, time_hash); if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { /* * XXX: somehow the bin processing is a bit ahead of time. * Put the entry in the next bin. */ if (++time_hash >= BW_METER_BUCKETS) time_hash = 0; } x->bm_time_next = bw_meter_timers[time_hash]; bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; continue; } /* * Test if we should deliver an upcall */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, &now); } /* * Reschedule for next processing */ schedule_bw_meter(x, &now); } } /* Send all upcalls that are pending delivery */ bw_upcalls_send(); MFC_UNLOCK(); } /* * A periodic function for sending all upcalls that are pending delivery */ static void expire_bw_upcalls_send(void *unused) { MFC_LOCK(); bw_upcalls_send(); MFC_UNLOCK(); callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, NULL); } /* * A periodic function for periodic scanning of the multicast forwarding * table for processing all "<=" bw_meter entries. */ static void expire_bw_meter_process(void *unused) { if (mrt_api_config & MRT_MFC_BW_UPCALL) bw_meter_process(); callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); } /* * End of bandwidth monitoring code */ #ifdef PIM /* * Send the packet up to the user daemon, or eventually do kernel encapsulation * */ static int pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, struct mfc *rt) { struct mbuf *mb_copy, *mm; if (mrtdebug & DEBUG_PIM) log(LOG_DEBUG, "pim_register_send: "); mb_copy = pim_register_prepare(ip, m); if (mb_copy == NULL) return ENOBUFS; /* * Send all the fragments. Note that the mbuf for each fragment * is freed by the sending machinery. */ for (mm = mb_copy; mm; mm = mb_copy) { mb_copy = mm->m_nextpkt; mm->m_nextpkt = 0; mm = m_pullup(mm, sizeof(struct ip)); if (mm != NULL) { ip = mtod(mm, struct ip *); if ((mrt_api_config & MRT_MFC_RP) && (rt->mfc_rp.s_addr != INADDR_ANY)) { pim_register_send_rp(ip, vifp, mm, rt); } else { pim_register_send_upcall(ip, vifp, mm, rt); } } } return 0; } /* * Return a copy of the data packet that is ready for PIM Register * encapsulation. * XXX: Note that in the returned copy the IP header is a valid one. */ static struct mbuf * pim_register_prepare(struct ip *ip, struct mbuf *m) { struct mbuf *mb_copy = NULL; int mtu; /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. 
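 *
 * The interesting arithmetic in this function is the post-encapsulation MTU
 * check.  A hedged sketch with assumed header sizes (20-byte outer IP header,
 * 8-byte PIM Register header; the real values come from pim_encap_iphdr and
 * pim_encap_pimhdr):
 *
 *    #include <stdio.h>
 *
 *    int
 *    main(void)
 *    {
 *        const int encap_iphdr = 20, encap_pimhdr = 8;   // assumed sizes
 *        const int mtu = 0xffff - encap_iphdr - encap_pimhdr;
 *        int ip_len = 65000;                 // made-up inner packet length
 *
 *        if (ip_len <= mtu)
 *            printf("fits: fix the header fields and checksum in place\n");
 *        else
 *            printf("too big: fragment to %d bytes before encapsulating\n", mtu);
 *        return (0);
 *    }
 *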
*/ mb_copy = m_copypacket(m, M_DONTWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); if (mb_copy == NULL) return NULL; /* take care of the TTL */ ip = mtod(mb_copy, struct ip *); --ip->ip_ttl; /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); if (ip->ip_len <= mtu) { /* Turn the IP header into a valid one */ ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) { m_freem(mb_copy); return NULL; } } return mb_copy; } /* * Send an upcall with the data packet to the user-level process. */ static int pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; VIF_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header */ MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); mb_first->m_len = sizeof(struct igmpmsg); mb_first->m_next = mb_copy; /* Send message to routing daemon */ im = mtod(mb_first, struct igmpmsg *); im->im_msgtype = IGMPMSG_WHOLEPKT; im->im_mbz = 0; im->im_vif = vifp - viftable; im->im_src = ip->ip_src; im->im_dst = ip->ip_dst; k_igmpsrc.sin_addr = ip->ip_src; mrtstat.mrts_upcalls++; if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) { if (mrtdebug & DEBUG_PIM) log(LOG_WARNING, "mcast: pim_register_send_upcall: ip_mrouter socket queue full"); ++mrtstat.mrts_upq_sockfull; return ENOBUFS; } /* Keep statistics */ pimstat.pims_snd_registers_msgs++; pimstat.pims_snd_registers_bytes += len; return 0; } /* * Encapsulate the data packet in PIM Register message and send it to the RP. */ static int pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; VIF_LOCK_ASSERT(); if ((vifi >= numvifs) || (viftable[vifi].v_lcl_addr.s_addr == 0)) { m_freem(mb_copy); return EADDRNOTAVAIL; /* The iif vif is invalid */ } /* * Add a new mbuf with the encapsulating header */ MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); mb_first->m_next = mb_copy; mb_first->m_pkthdr.len = len + mb_first->m_len; /* * Fill in the encapsulating IP and PIM header */ ip_outer = mtod(mb_first, struct ip *); *ip_outer = pim_encap_iphdr; #ifdef RANDOM_IP_ID ip_outer->ip_id = ip_randomid(); #else ip_outer->ip_id = htons(ip_id++); #endif ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); ip_outer->ip_src = viftable[vifi].v_lcl_addr; ip_outer->ip_dst = rt->mfc_rp; /* * Copy the inner header TOS to the outer header, and take care of the * IP_DF bit. 
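 *
 * In isolation (byte order ignored for clarity; the code below uses ntohs on
 * the inner ip_off), the inherit-TOS/propagate-DF step looks like this:
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    struct iphdr_bits { uint8_t tos; uint16_t off; };   // simplified stand-in
 *    #define IP_DF 0x4000                                 // don't-fragment bit
 *
 *    int
 *    main(void)
 *    {
 *        struct iphdr_bits inner = { 0x10, IP_DF };   // made-up inner header
 *        struct iphdr_bits outer = { 0, 0 };
 *
 *        outer.tos = inner.tos;            // inherit the inner TOS
 *        if (inner.off & IP_DF)            // propagate don't-fragment
 *            outer.off |= IP_DF;
 *        printf("outer tos %#x, DF %s\n", outer.tos,
 *            (outer.off & IP_DF) ? "set" : "clear");
 *        return (0);
 *    }
 *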
*/ ip_outer->ip_tos = ip->ip_tos; if (ntohs(ip->ip_off) & IP_DF) ip_outer->ip_off |= IP_DF; pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; /* If the iif crosses a border, set the Border-bit */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config) pimhdr->flags |= htonl(PIM_BORDER_REGISTER); mb_first->m_data += sizeof(pim_encap_iphdr); pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); mb_first->m_data -= sizeof(pim_encap_iphdr); if (vifp->v_rate_limit == 0) tbf_send_packet(vifp, mb_first); else tbf_control(vifp, mb_first, ip, ip_outer->ip_len); /* Keep statistics */ pimstat.pims_snd_registers_msgs++; pimstat.pims_snd_registers_bytes += len; return 0; } /* * PIM-SMv2 and PIM-DM messages processing. * Receives and verifies the PIM control messages, and passes them * up to the listening socket, using rip_input(). * The only message with special processing is the PIM_REGISTER message * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ void pim_input(struct mbuf *m, int off) { struct ip *ip = mtod(m, struct ip *); struct pim *pim; int minlen; int datalen = ip->ip_len; int ip_tos; int iphlen = off; /* Keep statistics */ pimstat.pims_rcv_total_msgs++; pimstat.pims_rcv_total_bytes += datalen; /* * Validate lengths */ if (datalen < PIM_MINLEN) { pimstat.pims_rcv_tooshort++; log(LOG_ERR, "pim_input: packet size too small %d from %lx\n", datalen, (u_long)ip->ip_src.s_addr); m_freem(m); return; } /* * If the packet is at least as big as a REGISTER, go agead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 */ minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); /* * Get the IP and PIM headers in contiguous memory, and * possibly the PIM REGISTER header. */ if ((m->m_flags & M_EXT || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { log(LOG_ERR, "pim_input: m_pullup failure\n"); return; } /* m_pullup() may have given us a new mbuf so reset ip. */ ip = mtod(m, struct ip *); ip_tos = ip->ip_tos; /* adjust mbuf to point to the PIM header */ m->m_data += iphlen; m->m_len -= iphlen; pim = mtod(m, struct pim *); /* * Validate checksum. If PIM REGISTER, exclude the data packet. * * XXX: some older PIMv2 implementations don't make this distinction, * so for compatibility reason perform the checksum over part of the * message, and if error, then over the whole message. */ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { /* do nothing, checksum okay */ } else if (in_cksum(m, datalen)) { pimstat.pims_rcv_badsum++; if (mrtdebug & DEBUG_PIM) log(LOG_DEBUG, "pim_input: invalid checksum"); m_freem(m); return; } /* PIM version check */ if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { pimstat.pims_rcv_badversion++; log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n", PIM_VT_V(pim->pim_vt), PIM_VERSION); m_freem(m); return; } /* restore mbuf back to the outer IP */ m->m_data -= iphlen; m->m_len += iphlen; if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { /* * Since this is a REGISTER, we'll make a copy of the register * headers ip + pim + u_int32 + encap_ip, to be passed up to the * routing daemon. 
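 *
 * Using the sizes quoted earlier in this function (PIM_MINLEN == 8,
 * PIM_REG_MINLEN == 28) and an assumed option-less 20-byte outer header, a
 * quick sketch of how much of the packet the daemon copy covers versus where
 * the forwarded inner packet starts:
 *
 *    #include <stdio.h>
 *
 *    int
 *    main(void)
 *    {
 *        const int pim_minlen = 8, pim_reg_minlen = 28;  // from the comment above
 *        int iphlen = 20;            // assumed outer IP header, no options
 *
 *        printf("copy %d header bytes up to the daemon\n",
 *            iphlen + pim_reg_minlen);
 *        printf("forwarded inner packet starts %d bytes in\n",
 *            iphlen + pim_minlen);
 *        return (0);
 *    }
 *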
*/ struct sockaddr_in dst = { sizeof(dst), AF_INET }; struct mbuf *mcp; struct ip *encap_ip; u_int32_t *reghdr; struct ifnet *vifp; VIF_LOCK(); if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) { VIF_UNLOCK(); if (mrtdebug & DEBUG_PIM) log(LOG_DEBUG, "pim_input: register vif not set: %d\n", reg_vif_num); m_freem(m); return; } /* XXX need refcnt? */ vifp = viftable[reg_vif_num].v_ifp; VIF_UNLOCK(); /* * Validate length */ if (datalen < PIM_REG_MINLEN) { pimstat.pims_rcv_tooshort++; pimstat.pims_rcv_badregisters++; log(LOG_ERR, "pim_input: register packet size too small %d from %lx\n", datalen, (u_long)ip->ip_src.s_addr); m_freem(m); return; } reghdr = (u_int32_t *)(pim + 1); encap_ip = (struct ip *)(reghdr + 1); if (mrtdebug & DEBUG_PIM) { log(LOG_DEBUG, "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n", (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), ntohs(encap_ip->ip_len)); } /* verify the version number of the inner packet */ if (encap_ip->ip_v != IPVERSION) { pimstat.pims_rcv_badregisters++; if (mrtdebug & DEBUG_PIM) { log(LOG_DEBUG, "pim_input: invalid IP version (%d) " "of the inner packet\n", encap_ip->ip_v); } m_freem(m); return; } /* verify the inner packet is destined to a mcast group */ if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { pimstat.pims_rcv_badregisters++; if (mrtdebug & DEBUG_PIM) log(LOG_DEBUG, "pim_input: inner packet of register is not " "multicast %lx\n", (u_long)ntohl(encap_ip->ip_dst.s_addr)); m_freem(m); return; } /* If a NULL_REGISTER, pass it to the daemon */ if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim_input_to_daemon; /* * Copy the TOS from the outer IP header to the inner IP header. */ if (encap_ip->ip_tos != ip_tos) { /* Outer TOS -> inner TOS */ encap_ip->ip_tos = ip_tos; /* Recompute the inner header checksum. Sigh... */ /* adjust mbuf to point to the inner IP header */ m->m_data += (iphlen + PIM_MINLEN); m->m_len -= (iphlen + PIM_MINLEN); encap_ip->ip_sum = 0; encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); /* restore mbuf to point back to the outer IP header */ m->m_data -= (iphlen + PIM_MINLEN); m->m_len += (iphlen + PIM_MINLEN); } /* * Decapsulate the inner IP packet and loopback to forward it * as a normal multicast packet. Also, make a copy of the * outer_iphdr + pimhdr + reghdr + encap_iphdr * to pass to the daemon later, so it can take the appropriate * actions (e.g., send back PIM_REGISTER_STOP). * XXX: here m->m_data points to the outer IP header. */ mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); if (mcp == NULL) { log(LOG_ERR, "pim_input: pim register: could not copy register head\n"); m_freem(m); return; } /* Keep statistics */ /* XXX: registers_bytes include only the encap. mcast pkt */ pimstat.pims_rcv_registers_msgs++; pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len); /* * forward the inner ip packet; point m_data at the inner ip. */ m_adj(m, iphlen + PIM_MINLEN); if (mrtdebug & DEBUG_PIM) { log(LOG_DEBUG, "pim_input: forwarding decapsulated register: " "src %lx, dst %lx, vif %d\n", (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), reg_vif_num); } /* NB: vifp was collected above; can it change on us? */ if_simloop(vifp, m, dst.sin_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } pim_input_to_daemon: /* * Pass the PIM message up to the daemon; if it is a Register message, * pass the 'head' only up to the daemon. 
This includes the * outer IP header, PIM header, PIM-Register header and the * inner IP header. * XXX: the outer IP header pkt size of a Register is not adjust to * reflect the fact that the inner multicast data is truncated. */ rip_input(m, iphlen); return; } #endif /* PIM */ static int ip_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: mtx_init(&mrouter_mtx, "mrouter initialization", NULL, MTX_DEF); MFC_LOCK_INIT(); VIF_LOCK_INIT(); ip_mrouter_reset(); ip_mcast_src = X_ip_mcast_src; ip_mforward = X_ip_mforward; ip_mrouter_done = X_ip_mrouter_done; ip_mrouter_get = X_ip_mrouter_get; ip_mrouter_set = X_ip_mrouter_set; ip_rsvp_force_done = X_ip_rsvp_force_done; ip_rsvp_vif = X_ip_rsvp_vif; legal_vif_num = X_legal_vif_num; mrt_ioctl = X_mrt_ioctl; rsvp_input_p = X_rsvp_input; break; case MOD_UNLOAD: /* * Typically module unload happens after the user-level * process has shutdown the kernel services (the check * below insures someone can't just yank the module out * from under a running process). But if the module is * just loaded and then unloaded w/o starting up a user * process we still need to cleanup. */ if (ip_mrouter) return EINVAL; X_ip_mrouter_done(); ip_mcast_src = NULL; ip_mforward = NULL; ip_mrouter_done = NULL; ip_mrouter_get = NULL; ip_mrouter_set = NULL; ip_rsvp_force_done = NULL; ip_rsvp_vif = NULL; legal_vif_num = NULL; mrt_ioctl = NULL; rsvp_input_p = NULL; VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); mtx_destroy(&mrouter_mtx); break; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: head/sys/netinet/raw_ip.c =================================================================== --- head/sys/netinet/raw_ip.c (revision 131150) +++ head/sys/netinet/raw_ip.c (revision 131151) @@ -1,876 +1,880 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_random_ip_id.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FAST_IPSEC #include #endif /*FAST_IPSEC*/ #ifdef IPSEC #include #endif /*IPSEC*/ struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; /* control hooks for ipfw and dummynet */ ip_fw_ctl_t *ip_fw_ctl_ptr; ip_dn_ctl_t *ip_dn_ctl_ptr; /* * hooks for multicast routing. They all default to NULL, * so leave them not initialized and rely on BSS being set to 0. */ /* The socket used to communicate with the multicast routing daemon. */ struct socket *ip_mrouter; /* The various mrouter and rsvp functions */ int (*ip_mrouter_set)(struct socket *, struct sockopt *); int (*ip_mrouter_get)(struct socket *, struct sockopt *); int (*ip_mrouter_done)(void); int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int (*mrt_ioctl)(int, caddr_t); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); void (*rsvp_input_p)(struct mbuf *m, int off); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); /* * Nominal space allocated to a raw ip socket. */ #define RIPSNDQ 8192 #define RIPRCVQ 8192 /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ void rip_init() { INP_INFO_LOCK_INIT(&ripcbinfo, "rip"); LIST_INIT(&ripcb); ripcbinfo.listhead = &ripcb; /* * XXX We don't use the hash list for raw IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask); ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask); ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); } static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; static int raw_append(struct inpcb *last, struct ip *ip, struct mbuf *n) { int policyfail = 0; INP_LOCK_ASSERT(last); #if defined(IPSEC) || defined(FAST_IPSEC) /* check AH/ESP integrity. */ if (ipsec4_in_reject(n, last)) { policyfail = 1; #ifdef IPSEC ipsecstat.in_polvio++; #endif /*IPSEC*/ /* do not inject data to pcb */ } #endif /*IPSEC || FAST_IPSEC*/ #ifdef MAC if (!policyfail && mac_check_inpcb_deliver(last, n) != 0) policyfail = 1; #endif if (!policyfail) { struct mbuf *opts = NULL; + struct socket *so; + so = last->inp_socket; if ((last->inp_flags & INP_CONTROLOPTS) || - (last->inp_socket->so_options & SO_TIMESTAMP)) + (so->so_options & SO_TIMESTAMP)) ip_savecontrol(last, &opts, ip, n); - if (sbappendaddr(&last->inp_socket->so_rcv, + SOCKBUF_LOCK(&so->so_rcv); + if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); + SOCKBUF_UNLOCK(&so->so_rcv); } else - sorwakeup(last->inp_socket); + sorwakeup_locked(so); } else m_freem(n); return policyfail; } /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with * mbuf chain. 
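 *
 * raw_append() above now queues data while holding the receive sockbuf lock
 * and hands that lock to sorwakeup_locked(), which releases it after waking
 * readers; the failure path unlocks explicitly.  A loose userland analogue of
 * that discipline, with a pthread mutex/condvar standing in for the sockbuf
 * lock and a byte counter standing in for the mbuf chain:
 *
 *    #include <pthread.h>
 *    #include <stdio.h>
 *
 *    // toy stand-in for a receive sockbuf: a byte count behind a mutex
 *    struct rcvbuf {
 *        pthread_mutex_t lock;
 *        pthread_cond_t  more;
 *        int             queued;
 *        int             limit;
 *    };
 *
 *    // analogue of the _locked append: caller already holds the lock
 *    static int
 *    append_locked(struct rcvbuf *sb, int nbytes)
 *    {
 *        if (sb->queued + nbytes > sb->limit)
 *            return (0);                   // no room, nothing queued
 *        sb->queued += nbytes;
 *        return (1);
 *    }
 *
 *    // analogue of sorwakeup_locked(): wakes readers and drops the lock
 *    static void
 *    rwakeup_locked(struct rcvbuf *sb)
 *    {
 *        pthread_cond_signal(&sb->more);
 *        pthread_mutex_unlock(&sb->lock);
 *    }
 *
 *    int
 *    main(void)
 *    {
 *        struct rcvbuf sb = { PTHREAD_MUTEX_INITIALIZER,
 *            PTHREAD_COND_INITIALIZER, 0, 8192 };
 *
 *        pthread_mutex_lock(&sb.lock);
 *        if (append_locked(&sb, 1500) == 0)
 *            pthread_mutex_unlock(&sb.lock);   // failed: just drop the lock
 *        else
 *            rwakeup_locked(&sb);              // success: wakeup consumes it
 *        printf("%d bytes queued\n", sb.queued);
 *        return (0);
 *    }
 *
 * The apparent point of the _locked variants is that the buffer update and
 * the wakeup happen under a single lock hold, with no unlock/relock between.
 *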
*/ void rip_input(struct mbuf *m, int off) { struct ip *ip = mtod(m, struct ip *); int proto = ip->ip_p; struct inpcb *inp, *last; INP_INFO_RLOCK(&ripcbinfo); ripsrc.sin_addr = ip->ip_src; last = NULL; LIST_FOREACH(inp, &ripcb, inp_list) { INP_LOCK(inp); if (inp->inp_ip_p && inp->inp_ip_p != proto) { docontinue: INP_UNLOCK(inp); continue; } #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) goto docontinue; #endif if (inp->inp_laddr.s_addr && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) goto docontinue; if (inp->inp_faddr.s_addr && inp->inp_faddr.s_addr != ip->ip_src.s_addr) goto docontinue; if (jailed(inp->inp_socket->so_cred)) if (htonl(prison_getip(inp->inp_socket->so_cred)) != ip->ip_dst.s_addr) goto docontinue; if (last) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) raw_append(last, ip, n); /* XXX count dropped packet */ INP_UNLOCK(last); } last = inp; } if (last != NULL) { if (raw_append(last, ip, m) != 0) ipstat.ips_delivered--; INP_UNLOCK(last); } else { m_freem(m); ipstat.ips_noproto++; ipstat.ips_delivered--; } INP_INFO_RUNLOCK(&ripcbinfo); } /* * Generate IP header and pass packet to ip_output. * Tack on options user may have setup with control call. */ int rip_output(struct mbuf *m, struct socket *so, u_long dst) { struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; /* * If the user handed us a complete IP packet, use it. * Otherwise, allocate an mbuf for a header and fill it in. */ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_TRYWAIT); if (m == NULL) return(ENOBUFS); INP_LOCK(inp); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; ip->ip_off = 0; ip->ip_p = inp->inp_ip_p; ip->ip_len = m->m_pkthdr.len; if (jailed(inp->inp_socket->so_cred)) ip->ip_src.s_addr = htonl(prison_getip(inp->inp_socket->so_cred)); else ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } INP_LOCK(inp); ip = mtod(m, struct ip *); if (jailed(inp->inp_socket->so_cred)) { if (ip->ip_src.s_addr != htonl(prison_getip(inp->inp_socket->so_cred))) { INP_UNLOCK(inp); m_freem(m); return (EPERM); } } /* don't allow both user specified and setsockopt options, and don't allow packet length sizes that will crash */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || (ip->ip_len > m->m_pkthdr.len) || (ip->ip_len < (ip->ip_hl << 2))) { INP_UNLOCK(inp); m_freem(m); return EINVAL; } if (ip->ip_id == 0) #ifdef RANDOM_IP_ID ip->ip_id = ip_randomid(); #else ip->ip_id = htons(ip_id++); #endif /* XXX prevent ip_output from overwriting header fields */ flags |= IP_RAWOUTPUT; ipstat.ips_rawout++; } if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; #ifdef MAC mac_create_mbuf_from_inpcb(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp); INP_UNLOCK(inp); return error; } /* * Raw IP socket option processing. * * Note that access to all of the IP administrative functions here is * implicitly protected by suser() as gaining access to a raw socket * requires either that the thread pass a suser() check, or that it be * passed a raw socket by another thread that has passed a suser() check. 
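 *
 * The practical effect, seen from userland, is that opening a raw IP socket
 * without privilege is refused, so anything reaching these handlers already
 * passed that gate.  A small hypothetical probe (typically fails with EPERM
 * for an unprivileged user):
 *
 *    #include <errno.h>
 *    #include <stdio.h>
 *    #include <string.h>
 *    #include <unistd.h>
 *    #include <sys/socket.h>
 *    #include <netinet/in.h>
 *
 *    int
 *    main(void)
 *    {
 *        int s = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
 *
 *        if (s == -1)
 *            printf("raw socket refused: %s\n", strerror(errno));
 *        else {
 *            printf("raw socket granted (running with privilege)\n");
 *            close(s);
 *        }
 *        return (0);
 *    }
 *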
* If FreeBSD moves to a more fine-grained access control mechanism, * additional checks will need to be placed here if the raw IP attachment * check is not equivilent the the check required for these * administrative operations; in some cases, these checks are already * present. */ int rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: if (IPFW_LOADED) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_GET: if (DUMMYNET_LOADED) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: if (IPFW_LOADED) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: if (DUMMYNET_LOADED) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = ip_rsvp_done(); break; case IP_RSVP_VIF_ON: case IP_RSVP_VIF_OFF: error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which * are sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, * and calls in_ifadown() to remove all routes corresponding to that address. * It also receives the PRC_IFUP messages from if_up() and reinstalls the * interface routes. */ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { /* * in_ifscrub kills the interface route. */ in_ifscrub(ia->ia_ifp, ia); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. 
*/ in_ifadown(&ia->ia_ifa, 0); break; } } break; case PRC_IFUP: TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) return; flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; break; } } u_long rip_sendspace = RIPSNDQ; u_long rip_recvspace = RIPRCVQ; SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; /* XXX why not lower? */ INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp) { /* XXX counter, printf */ INP_INFO_WUNLOCK(&ripcbinfo); return EINVAL; } if (td && jailed(td->td_ucred) && !jail_allow_raw_sockets) { INP_INFO_WUNLOCK(&ripcbinfo); return (EPERM); } if (td && (error = suser_cred(td->td_ucred, PRISON_ROOT)) != 0) { INP_INFO_WUNLOCK(&ripcbinfo); return error; } if (proto >= IPPROTO_MAX || proto < 0) { INP_INFO_WUNLOCK(&ripcbinfo); return EPROTONOSUPPORT; } error = soreserve(so, rip_sendspace, rip_recvspace); if (error) { INP_INFO_WUNLOCK(&ripcbinfo); return error; } error = in_pcballoc(so, &ripcbinfo, "rawinp"); if (error) { INP_INFO_WUNLOCK(&ripcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_LOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = ip_defttl; INP_UNLOCK(inp); return 0; } static void rip_pcbdetach(struct socket *so, struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(&ripcbinfo); INP_LOCK_ASSERT(inp); if (so == ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); if (so == ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); } static int rip_detach(struct socket *so) { struct inpcb *inp; INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp == 0) { /* XXX counter, printf */ INP_INFO_WUNLOCK(&ripcbinfo); return EINVAL; } INP_LOCK(inp); rip_pcbdetach(so, inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip_abort(struct socket *so) { struct inpcb *inp; INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&ripcbinfo); return EINVAL; /* ??? possible? panic instead? 
*/ } INP_LOCK(inp); soisdisconnected(so); if (so->so_state & SS_NOFDREF) rip_pcbdetach(so, inp); else INP_UNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip_disconnect(struct socket *so) { if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; return rip_abort(so); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (jailed(td->td_ucred)) { if (addr->sin_addr.s_addr == INADDR_ANY) addr->sin_addr.s_addr = htonl(prison_getip(td->td_ucred)); if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr) return (EADDRNOTAVAIL); } if (TAILQ_EMPTY(&ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && ifa_ifwithaddr((struct sockaddr *)addr) == 0)) return EADDRNOTAVAIL; INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&ripcbinfo); return EINVAL; } INP_LOCK(inp); inp->inp_laddr = addr->sin_addr; INP_UNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet)) return EADDRNOTAVAIL; if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return EAFNOSUPPORT; INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&ripcbinfo); return EINVAL; } INP_LOCK(inp); inp->inp_faddr = addr->sin_addr; soisconnected(so); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip_shutdown(struct socket *so) { struct inpcb *inp; INP_INFO_RLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_RUNLOCK(&ripcbinfo); return EINVAL; } INP_LOCK(inp); INP_INFO_RUNLOCK(&ripcbinfo); socantsendmore(so); INP_UNLOCK(inp); return 0; } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; u_long dst; int ret; INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (so->so_state & SS_ISCONNECTED) { if (nam) { INP_INFO_WUNLOCK(&ripcbinfo); m_freem(m); return EISCONN; } dst = inp->inp_faddr.s_addr; } else { if (nam == NULL) { INP_INFO_WUNLOCK(&ripcbinfo); m_freem(m); return ENOTCONN; } dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; } ret = rip_output(m, so, dst); INP_INFO_WUNLOCK(&ripcbinfo); return ret; } static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = ripcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. 
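 *
 * The sizing-only pass above (req->oldptr == 0) answers with a deliberately
 * generous estimate instead of building the list twice.  The same arithmetic,
 * sketched with made-up record sizes:
 *
 *    #include <stddef.h>
 *    #include <stdio.h>
 *
 *    int
 *    main(void)
 *    {
 *        size_t xig_size = 64, xinpcb_size = 512;   // made-up record sizes
 *        size_t n = 100;                            // current pcb count
 *        // two generation records plus roughly 12.5% slack for growth
 *        size_t estimate = 2 * xig_size + (n + n / 8) * xinpcb_size;
 *
 *        printf("advertise %zu bytes for a sizing-only request\n", estimate);
 *        return (0);
 *    }
 *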
*/ INP_INFO_RLOCK(&ripcbinfo); gencnt = ripcbinfo.ipi_gencnt; n = ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; INP_INFO_RLOCK(&ripcbinfo); for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) { /* XXX held references? */ inp_list[i++] = inp; } INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&ripcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&ripcbinfo); xig.xig_gen = ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int rip_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &ripcbinfo)); } /* * This is the wrapper function for in_setpeeraddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int rip_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &ripcbinfo)); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); struct pr_usrreqs rip_usrreqs = { rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect, pru_connect2_notsupp, in_control, rip_detach, rip_disconnect, pru_listen_notsupp, rip_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown, rip_sockaddr, sosend, soreceive, sopoll, in_pcbsosetlabel }; Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c (revision 131150) +++ head/sys/netinet/tcp_input.c (revision 131151) @@ -1,3315 +1,3320 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" #include "opt_tcp_sack.h" #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include #include /* for ICMP_BANDLIM */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef FAST_IPSEC #include #include #endif /*FAST_IPSEC*/ #ifdef IPSEC #include #include #include #endif /*IPSEC*/ #include static const int tcprexmtthresh = 3; tcp_cc tcp_ccgen; struct tcpstat tcpstat; SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); static int log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming TCP connections"); static int blackhole = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send RST when dropping refused connections"); int tcp_delack_enabled = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); #ifdef TCP_DROP_SYNFIN static int drop_synfin = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif static int tcp_do_rfc3042 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); static int tcp_do_rfc3390 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, &tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); static int tcp_reass_maxseg = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, &tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); int tcp_reass_qsize = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, &tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); static int tcp_reass_maxqlen = 48; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW, &tcp_reass_maxqlen, 0, "Maximum number of TCP Segments per individual Reassembly Queue"); static int tcp_reass_overflows = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, 
overflows, CTLFLAG_RD, &tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *, int, int, struct tcphdr *); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); static int tcp_timewait(struct tcptw *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window and * - delayed acks are enabled or * - this is a half-synchronized T/TCP connection. */ #define DELAY_ACK(tp) \ ((!callout_active(tp->tt_delack) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* Initialize TCP reassembly queue */ uma_zone_t tcp_reass_zone; void tcp_reass_init() { tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", &tcp_reass_maxseg); tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); } static int tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; register struct tcphdr *th; int *tlenp; struct mbuf *m; { struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; struct tseg_qent *te = NULL; struct socket *so = tp->t_inpcb->inp_socket; int flags; /* * XXX: tcp_reass() is rather inefficient with its data structures * and should be rewritten (see NetBSD for optimizations). While * doing that it should move to its own file tcp_reass.c. */ /* * Call with th==0 after become established to * force pre-ESTABLISHED data up to user socket. */ if (th == 0) goto present; /* * Limit the number of segments in the reassembly queue to prevent * holding on to too many segments (and thus running out of mbufs). * Make sure to let the missing segment through which caused this * queue. Always keep one global queue entry spare to be able to * process the missing segment. */ if (th->th_seq != tp->rcv_nxt && (tcp_reass_qsize + 1 >= tcp_reass_maxseg || tp->t_segqlen >= tcp_reass_maxqlen)) { tcp_reass_overflows++; tcpstat.tcps_rcvmemdrop++; m_freem(m); return (0); } /* * Allocate a new queue entry. If we can't, or hit the zone limit * just drop the pkt. */ te = uma_zalloc(tcp_reass_zone, M_NOWAIT); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); return (0); } tp->t_segqlen++; tcp_reass_qsize++; /* * Find a segment which begins after this one does. */ LIST_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; } /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. 
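 *
 * The overlap test below reduces to simple sequence arithmetic; the cast to
 * int is what tolerates sequence-number wraparound.  A standalone sketch with
 * made-up sequence numbers:
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    int
 *    main(void)
 *    {
 *        uint32_t prev_seq = 1000, new_seq = 1300;   // made-up segments
 *        int prev_len = 500, new_len = 400;
 *        int i = (int)(prev_seq + prev_len - new_seq);   // overlap, if any
 *
 *        if (i >= new_len)
 *            printf("complete duplicate: drop the new segment\n");
 *        else if (i > 0) {
 *            new_seq += i;               // trim the overlapping prefix
 *            new_len -= i;
 *            printf("trimmed %d bytes, keep [%u, %u)\n", i,
 *                (unsigned)new_seq, (unsigned)(new_seq + new_len));
 *        } else
 *            printf("no overlap with the preceding segment\n");
 *        return (0);
 *    }
 *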
*/ if (p != NULL) { register int i; /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); uma_zfree(tcp_reass_zone, te); tp->t_segqlen--; tcp_reass_qsize--; /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS * completes. */ goto present; /* ??? */ } m_adj(m, i); *tlenp -= i; th->th_seq += i; } } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; if (i < q->tqe_len) { q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); break; } nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; q = nq; } /* Insert the new segment queue entry into place. */ te->tqe_m = m; te->tqe_th = th; te->tqe_len = *tlenp; if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { LIST_INSERT_AFTER(p, te, tqe_q); } present: /* * Present data to user, advancing rcv_nxt through * completed sequence space. */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); + SOCKBUF_LOCK(&so->so_rcv); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); + /* Unlocked read. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(q->tqe_m); else - sbappendstream(&so->so_rcv, q->tqe_m); + sbappendstream_locked(&so->so_rcv, q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); - sorwakeup(so); + sorwakeup_locked(so); return (flags); } /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ #ifdef INET6 int tcp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { register struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(m, off0) register struct mbuf *m; int off0; { register struct tcphdr *th; register struct ip *ip = NULL; register struct ipovly *ipov; register struct inpcb *inp = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; register struct tcpcb *tp = 0; register int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ struct rmxp_tao tao; /* our TAO cache entry */ int headlocked = 0; struct sockaddr_in *next_hop = NULL; int rstreason; /* For badport_bandlim accounting purposes */ struct ip6_hdr *ip6 = NULL; #ifdef INET6 int isipv6; #else const int isipv6 = 0; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. 
*/ u_char tcp_saveipgen[40]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ next_hop = m_claim_next(m, PACKET_TAG_IPFORWARD); #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif bzero(&tao, sizeof(tao)); bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; if (isipv6) { #ifdef INET6 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } #else th = NULL; /* XXX: avoid compiler warning */ #endif } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; #ifdef TCPDEBUG ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); #endif } else { /* * Checksum extended TCP header and data. */ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } #ifdef INET6 /* Re-initialization for later version check */ ip->ip_v = IPVERSION; #endif } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { #ifdef INET6 IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); #endif } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; #ifdef TCP_DROP_SYNFIN /* * If the drop_synfin option is enabled, drop all packets with * both the SYN and FIN bits set. This prevents e.g. nmap from * identifying the TCP/IP stack. * * This is a violation of the TCP specification. */ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) goto drop; #endif /* * Convert TCP protocol specific fields to host format. 
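 *
 * In isolation the conversion is just ntohl/ntohs applied to each multi-byte
 * field, e.g.:
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *    #include <arpa/inet.h>
 *
 *    int
 *    main(void)
 *    {
 *        uint32_t seq = htonl(0x01020304);   // as it would arrive off the wire
 *        uint16_t win = htons(0x1234);
 *
 *        printf("seq %#x win %#x\n", (unsigned)ntohl(seq), (unsigned)ntohs(win));
 *        return (0);
 *    }
 *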
*/ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options, * until after ip6_savecontrol() is called and before other functions * which don't want those proto headers. * Because ip6_savecontrol() is going to parse the mbuf to * search for data to be passed up to user-land, it wants mbuf * parameters to be unchanged. * XXX: the call of ip6_savecontrol() has been obsoleted based on * latest version of the advanced API (20020110). */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&tcbinfo); headlocked = 1; findpcb: /* IPFIREWALL_FORWARD section */ if (next_hop != NULL && isipv6 == 0) { /* IPv6 support is not yet */ /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try find the ambushing socket. */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, 1, m->m_pkthdr.rcvif); } } else { if (isipv6) { #ifdef INET6 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, 1, m->m_pkthdr.rcvif); #endif } else inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); } #if defined(IPSEC) || defined(FAST_IPSEC) #ifdef INET6 if (isipv6) { if (inp != NULL && ipsec6_in_reject(m, inp)) { #ifdef IPSEC ipsec6stat.in_polvio++; #endif goto drop; } } else #endif /* INET6 */ if (inp != NULL && ipsec4_in_reject(m, inp)) { #ifdef IPSEC ipsecstat.in_polvio++; #endif goto drop; } #endif /*IPSEC || FAST_IPSEC*/ /* * If the state is CLOSED (i.e., TCB does not exist) then * all data in the incoming segment is discarded. * If the TCB exists but is in CLOSED state, it is embryonic, * but should either do a listen or a connect soon. */ if (inp == NULL) { if (log_in_vain) { #ifdef INET6 char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2]; #else char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; #endif if (isipv6) { #ifdef INET6 strcpy(dbuf, "["); strcpy(sbuf, "["); strcat(dbuf, ip6_sprintf(&ip6->ip6_dst)); strcat(sbuf, ip6_sprintf(&ip6->ip6_src)); strcat(dbuf, "]"); strcat(sbuf, "]"); #endif } else { strcpy(dbuf, inet_ntoa(ip->ip_dst)); strcpy(sbuf, inet_ntoa(ip->ip_src)); } switch (log_in_vain) { case 1: if ((thflags & TH_SYN) == 0) break; /* FALLTHROUGH */ case 2: log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d flags:0x%02x\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport), thflags); break; default: break; } } if (blackhole) { switch (blackhole) { case 1: if (thflags & TH_SYN) goto drop; break; case 2: goto drop; default: goto drop; } } rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK(inp); if (inp->inp_vflag & INP_TIMEWAIT) { /* * The only option of relevance is TOF_CC, and only if * present in a SYN segment. See tcp_timewait(). */ if (thflags & TH_SYN) tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th); if (tcp_timewait((struct tcptw *)inp->inp_ppcb, &to, th, m, tlen)) goto findpcb; /* * tcp_timewait unlocks inp. */ INP_INFO_WUNLOCK(&tcbinfo); return; } tp = intotcpcb(inp); if (tp == 0) { INP_UNLOCK(inp); rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if (tp->t_state == TCPS_CLOSED) goto drop; /* Unscale the window into a 32-bit value. 
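 *
 * A sketch of the unscaling rule applied just below: shift by the negotiated
 * scale, except that windows carried in SYN segments are never scaled
 * (RFC 1323).  Sample numbers are made up:
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    int
 *    main(void)
 *    {
 *        uint16_t th_win = 8192;   // 16-bit window field from the header
 *        int snd_scale = 4;        // negotiated shift count
 *        int is_syn = 0;           // SYN segments carry an unscaled window
 *        uint32_t tiwin;
 *
 *        tiwin = is_syn ? th_win : (uint32_t)th_win << snd_scale;
 *        printf("effective window: %u bytes\n", (unsigned)tiwin);
 *        return (0);
 *    }
 *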
*/ if ((thflags & TH_SYN) == 0) tiwin = th->th_win << tp->snd_scale; else tiwin = th->th_win; #ifdef MAC INP_LOCK_ASSERT(inp); if (mac_check_inpcb_deliver(inp, m)) goto drop; #endif so = inp->inp_socket; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; #ifdef INET6 inc.inc_isipv6 = isipv6; #endif if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * If the state is LISTEN then ignore segment if it contains * a RST. If the segment contains an ACK then it is bad and * send a RST. If it does not contain a SYN then it is not * interesting; drop it. * * If the state is SYN_RECEIVED (syncache) and seg contains * an ACK, but not for our SYN/ACK, send a RST. If the seg * contains a RST, check the sequence number to see if it * is a valid reset segment. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { if (!syncache_expand(&inc, th, &so, m)) { /* * No syncache entry, or ACK was not * for our SYN/ACK. Send a RST. */ tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * Could not complete 3-way handshake, * connection is being closed down, and * syncache will free mbuf. */ INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Socket is created in state SYN_RECEIVED. * Continue processing segment. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); /* * This is what would have happened in * tcp_output() when the SYN,ACK was sent. */ tp->snd_up = tp->snd_una; tp->snd_max = tp->snd_nxt = tp->iss + 1; tp->last_ack_sent = tp->rcv_nxt; /* * RFC1323: The window in SYN & SYN/ACK * segments is never scaled. */ tp->snd_wnd = tiwin; /* unscaled */ goto after_listen; } if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto drop; } if (thflags & TH_ACK) { syncache_badack(&inc); tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } goto drop; } /* * Segment's flags are (SYN) or (SYN|FIN). */ #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. 
*/ if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { INP_UNLOCK(inp); tp = NULL; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * If it is from this socket, drop it, it must be forged. * Don't bother responding if the destination was a broadcast. */ if (th->th_dport == th->th_sport) { if (isipv6) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) goto drop; } else { if (ip->ip_dst.s_addr == ip->ip_src.s_addr) goto drop; } } /* * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN * * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* * SYN appears to be valid; create compressed TCP state * for syncache, or perform t/tcp connection. */ if (so->so_qlen <= so->so_qlimit) { #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(tp, &to, optp, optlen, 1, th); if (!syncache_add(&inc, &to, th, &so, m)) goto drop; if (so == NULL) { /* * Entry added to syncache, mbuf used to * send SYN,ACK packet. */ KASSERT(headlocked, ("headlocked")); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Segment passed TAO tests. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); tp->snd_wnd = tiwin; tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; /* * T/TCP logic: * If there is a FIN or if there is data, then * delay SYN,ACK(SYN) in the hope of piggy-backing * it on a response segment. Otherwise must send * ACK now in case the other side is slow starting. */ if (thflags & TH_FIN || tlen != 0) tp->t_flags |= (TF_DELACK | TF_NEEDSYN); else tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcpstat.tcps_connects++; soisconnected(so); goto trimthenstep6; } goto drop; } after_listen: /* XXX temp debugging */ /* should not happen - syncache should pick up these connections */ if (tp->t_state == TCPS_LISTEN) panic("tcp_input: TCPS_LISTEN"); /* * This is the second part of the MSS DoS prevention code (after * minmss on the sending side) and it deals with too many too small * tcp packets in a too short timeframe (1 second). * * For every full second we count the number of received packets * and bytes. If we get a lot of packets per second for this connection * (tcp_minmssoverload) we take a closer look at it and compute the * average packet size for the past second. If that is less than * tcp_minmss we get too many packets with very small payload which * is not good and burdens our system (and every packet generates * a wakeup to the process connected to our socket). We can reasonable * expect this to be small packet DoS attack to exhaust our CPU * cycles. * * Care has to be taken for the minimum packet overload value. This * value defines the minimum number of packets per second before we * start to worry. This must not be too low to avoid killing for * example interactive connections with many small packets like * telnet or SSH. * * Setting either tcp_minmssoverload or tcp_minmss to "0" disables * this check. 
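 * A worked example, assuming the default tunables of tcp_minmss = 216 and
 * tcp_minmssoverload = 1000: a peer sending 1500 data segments within one
 * second trips the per-second packet count, and if those segments carry
 * only a few dozen payload bytes each the per-packet average (which also
 * includes the TCP header length) stays far below 216, so the connection
 * is dropped with ECONNRESET and tcps_minmssdrops is bumped.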
* * Account for packet if payload packet, skip over ACK, etc. */ if (tcp_minmss && tcp_minmssoverload && tp->t_state == TCPS_ESTABLISHED && tlen > 0) { if (tp->rcv_second > ticks) { tp->rcv_pps++; tp->rcv_byps += tlen + off; if (tp->rcv_pps > tcp_minmssoverload) { if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) { printf("too many small tcp packets from " "%s:%u, av. %lubyte/packet, " "dropping connection\n", #ifdef INET6 isipv6 ? ip6_sprintf(&inp->inp_inc.inc6_faddr) : #endif inet_ntoa(inp->inp_inc.inc_faddr), inp->inp_inc.inc_fport, tp->rcv_byps / tp->rcv_pps); tp = tcp_drop(tp, ECONNRESET); tcpstat.tcps_minmssdrops++; goto drop; } } } else { tp->rcv_second = ticks + hz; tp->rcv_pps = 1; tp->rcv_byps = tlen + off; } } /* * Segment received on connection. * Reset idle time and keep-alive timer. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * XXX this is traditional behavior, may need to be cleaned up. */ tcp_dooptions(tp, &to, optp, optlen, thflags & TH_SYN, th); if (thflags & TH_SYN) { if (to.to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; tp->requested_s_scale = to.to_requested_s_scale; } if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & (TOF_CC|TOF_CCNEW)) tp->t_flags |= TF_RCVD_CC; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if (tp->sack_enable) { if (!(to.to_flags & TOF_SACK)) tp->sack_enable = 0; else tp->t_flags |= TF_SACK_PERMIT; } } if (tp->sack_enable) { /* Delete stale (cumulatively acked) SACK holes */ tcp_del_sackholes(tp, th); tp->rcv_laststart = th->th_seq; /* last recv'd segment*/ tp->rcv_lastend = th->th_seq + tlen; } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED above, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && /* * Using the CC option is compulsory if once started: * the segment is OK if no T/TCP was negotiated or * if the segment has a CC option equal to CCrecv */ ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). 
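 * Note that the SEQ_LEQ() test just below, like the other SEQ_* macros,
 * compares sequence numbers in modulo-2^32 arithmetic (in effect
 * ((int)((a)-(b)) <= 0)), so it stays correct across sequence number
 * wraparound: SEQ_LEQ(0xfffffff0, 0x00000010) is true even though the
 * unsigned values compare the other way.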
*/ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && ((!tcp_do_newreno && !tp->sack_enable && tp->t_dupacks < tcprexmtthresh) || ((tcp_do_newreno || tp->sack_enable) && !IN_FASTRECOVERY(tp)))) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); /* * this is a pure ack for outstanding data. */ ++tcpstat.tcps_predack; /* * "bad retransmit" recovery */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; /* * pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* some progress has been done */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif */ if (tp->snd_una == tp->snd_max) callout_stop(tp->tt_rexmt); else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && tlen <= sbspace(&so->so_rcv)) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ /* Clean receiver SACK report if present */ if (tp->sack_enable && tp->rcv_numsacks) tcp_clean_sackreport(tp); ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* some progress has been done */ /* #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif * Add data to socket buffer. */ + /* Unlocked read. 
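 * The receive path below follows the pattern this change introduces:
 * acquire SOCKBUF_LOCK(&so->so_rcv), test SBS_CANTRCVMORE, append with
 * sbappendstream_locked(), and finish with sorwakeup_locked(), which
 * performs the wakeup and releases the sockbuf lock itself, so no
 * separate SOCKBUF_UNLOCK() is needed on this path.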
*/ + SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { m_adj(m, drop_hdrlen); /* delayed header drop */ - sbappendstream(&so->so_rcv, m); + sbappendstream_locked(&so->so_rcv, m); } - sorwakeup(so); + sorwakeup_locked(so); if (DELAY_ACK(tp)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ { int win; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if (tcp_do_rfc1644) tcp_hc_gettao(&inp->inp_inc, &tao); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { /* * If we have a cached CCsent for the remote host, * hence we haven't just crashed and restarted, * do not send a RST. This may be a retransmission * from the other side after our earlier ACK was lost. * Our new SYN, when it arrives, will serve as the * needed ACK. */ if (tao.tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } if (thflags & TH_RST) { if (thflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } if ((thflags & TH_SYN) == 0) goto drop; tp->snd_wnd = th->th_win; /* initial send window */ tp->cc_recv = to.to_cc; /* foreign CC */ tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { /* * Our SYN was acked. If segment contains CC.ECHO * option, check it to make sure this segment really * matches our SYN. If not, just drop it as old * duplicate, but send an RST if we're still playing * by the old rules. If no CC.ECHO option, make sure * we don't get fooled into using T/TCP. */ if (to.to_flags & TOF_CCECHO) { if (tp->cc_send != to.to_ccecho) { if (tao.tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } } else tp->t_flags &= ~TF_RCVD_CC; tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC SOCK_LOCK(so); mac_set_socket_peer_from_mbuf(m, so); SOCK_UNLOCK(so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* Segment is acceptable, update cache if undefined. */ if (tao.tao_ccsent == 0 && tcp_do_rfc1644) tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0); tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. 
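 * (The delayed-ACK callout armed below fires after tcp_delacktime,
 * typically 100 ms, so at worst one delayed-ACK interval is added to the
 * handshake when the peer's SYN carried data.)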
*/ if (DELAY_ACK(tp) && tlen != 0) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= TF_ACKNOW; callout_stop(tp->tt_rexmt); if (to.to_flags & TOF_CC) { if (tao.tao_cc != 0 && CC_GT(to.to_cc, tao.tao_cc)) { /* * update cache and make transition: * SYN-SENT -> ESTABLISHED* * SYN-SENT* -> FIN-WAIT-1* */ tao.tao_cc = to.to_cc; tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC, to.to_cc, 0); tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } tp->t_flags |= TF_NEEDSYN; } else tp->t_state = TCPS_SYN_RECEIVED; } else { if (tcp_do_rfc1644) { /* CC.NEW or no option => invalidate cache */ tao.tao_cc = 0; tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC, to.to_cc, 0); } tp->t_state = TCPS_SYN_RECEIVED; } } trimthenstep6: /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * if segment contains a SYN and CC [not CC.NEW] option: * if state == TIME_WAIT and connection duration > MSL, * drop packet and send RST; * * if SEG.CC > CCrecv then is new SYN, and can implicitly * ack the FIN (and data) in retransmission queue. * Complete close and delete TCPCB. Then reprocess * segment, hoping to find new TCPCB in LISTEN state; * * else must be old SYN; drop it. * else do normal processing. */ case TCPS_LAST_ACK: case TCPS_CLOSING: case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); if ((thflags & TH_SYN) && (to.to_flags & TOF_CC) && tp->cc_recv != 0) { if (tp->t_state == TCPS_TIME_WAIT && (ticks - tp->t_starttime) > tcp_msl) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if (CC_GT(to.to_cc, tp->cc_recv)) { tp = tcp_close(tp); goto findpcb; } else goto drop; } break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 
960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * Note 2: Paul Watson's paper "Slipping in the Window" has shown * that brute force RST attacks are possible. To combat this, * we use a much stricter check while in the ESTABLISHED state, * only accepting RSTs where the sequence number is equal to * last_ack_sent. In all other states (the states in which a * RST is more likely), the more permissive check is used. * If we have multiple segments in flight, the intial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) { if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: if (tp->last_ack_sent != th->th_seq) { tcpstat.tcps_badrst++; goto drop; } case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: tp = tcp_close(tp); break; case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. 
If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; if (tlen) goto dropafterack; goto drop; } } /* * T/TCP mechanism * If T/TCP was negotiated and the segment doesn't have CC, * or if its CC is wrong then drop the segment. * RST segments do not have to comply with this. */ if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) goto dropafterack; /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { tcpstat.tcps_rcvbyteafterwin += tlen; /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); if (thflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(th->th_seq, tp->rcv_nxt)) { tp = tcp_close(tp); goto findpcb; } /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. 
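 * For example, with rcv_wnd == 0 a one-byte window probe that starts
 * exactly at rcv_nxt is counted as tcps_rcvwinprobe and answered through
 * TF_ACKNOW, while its data byte is trimmed off just below; any other
 * out-of-window segment is simply acked and dropped.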
*/ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto drop; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* * Upon successful completion of 3-way handshake, * update cache.CC, pass any queued data to the user, * and advance state appropriately. */ if (tcp_do_rfc1644) { tao.tao_cc = tp->cc_recv; tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC, tp->cc_recv, 0); } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). 
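 * A worked example, ignoring limited transmit and SACK: with
 * snd_cwnd == snd_wnd == 32 * t_maxseg, the third duplicate ACK sets
 * snd_ssthresh to 16 * t_maxseg, retransmits the missing segment with a
 * temporary cwnd of one segment, and then re-inflates snd_cwnd to
 * snd_ssthresh plus 3 * t_maxseg.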
* * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. */ if (!callout_active(tp->tt_rexmt) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp))) { tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win; if ((tcp_do_newreno || tp->sack_enable) && SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; if (tp->sack_enable) { tcpstat.tcps_sack_recovery_episode++; tp->snd_cwnd = tp->t_maxseg * tp->t_dupacks; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh; goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big")); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("dupacks not 1 or 2")); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("sent too much")); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una")); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (tcp_do_newreno || tp->sack_enable) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->sack_enable) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else { /* * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } tp->t_dupacks = 0; if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? 
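 * Window scaling only takes effect when both sides agreed, i.e. both
 * TF_RCVD_SCALE and TF_REQ_SCALE are set. With a scale factor of 7, for
 * instance, a 16-bit advertisement of 65535 corresponds to 65535 << 7,
 * roughly 8 MB of window.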
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } } process_ACK: acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { callout_stop(tp->tt_rexmt); needoutput = 1; } else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). 
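 * For instance, with t_maxseg = 1460 and snd_cwnd = 14600 (ten segments)
 * already above snd_ssthresh, each new ACK grows the window by
 * 1460 * 1460 / 14600 = 146 bytes, i.e. roughly one full segment per
 * window's worth of ACKs.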
*/ if ((!tcp_do_newreno && !tp->sack_enable) || !IN_FASTRECOVERY(tp)) { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<snd_scale); } SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } - SOCKBUF_UNLOCK(&so->so_snd); - sowwakeup(so); + sowwakeup_locked(so); /* detect una wraparound */ if ((tcp_do_newreno || tp->sack_enable) && !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; if ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; if (tp->sack_enable) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. */ /* XXXjl * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, tcp_timer_2msl, tp); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { KASSERT(headlocked, ("headlocked")); tcp_twstart(tp); INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; /* * In TIME_WAIT state the only thing that should arrive * is a retransmission of the remote FIN. Acknowledge * it and restart the finack timer. */ case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); goto dropafterack; } } step6: /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. 
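 * In other words, assuming the default sb_max of 256 kB: an urgent
 * pointer that would place the out-of-band mark beyond 256 kB of
 * already-buffered receive data is treated as garbage and cleared rather
 * than honored.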
*/ if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; SOCKBUF_LOCK(&so->so_rcv); so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; SOCKBUF_UNLOCK(&so->so_rcv); sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (u_long)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("headlocked")); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); + /* Unlocked read. */ + SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else - sbappendstream(&so->so_rcv, m); - sorwakeup(so); + sbappendstream_locked(&so->so_rcv, m); + sorwakeup_locked(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tp->sack_enable) tcp_update_sack_list(tp); /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. 
*/ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /*FALLTHROUGH*/ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: KASSERT(headlocked == 1, ("headlocked should be 1")); tcp_twstart(tp); INP_INFO_WUNLOCK(&tcbinfo); return; /* * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); break; } } INP_INFO_WUNLOCK(&tcbinfo); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); check_delack: if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); } INP_UNLOCK(inp); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif KASSERT(headlocked, ("headlocked should be 1")); INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_UNLOCK(inp); return; dropwithreset: /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* IPv6 anycast check is done at tcp6_input() */ /* * Perform bandwidth limiting. 
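 * badport_bandlim() rate-limits the RSTs generated here, classified by
 * rstreason (e.g. BANDLIM_RST_CLOSEDPORT vs. BANDLIM_RST_OPENPORT); a
 * negative return means the per-second response budget, shared with the
 * ICMP limiter (the icmplim sysctl), has been exhausted and the RST is
 * suppressed.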
*/ if (badport_bandlim(rstreason) < 0) goto drop; #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } if (tp) INP_UNLOCK(inp); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp) INP_UNLOCK(inp); m_freem(m); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Parse TCP options and place in tcpopt. */ static void tcp_dooptions(tp, to, cp, cnt, is_syn, th) struct tcpcb *tp; struct tcpopt *to; u_char *cp; int cnt; int is_syn; struct tcphdr *th; { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!is_syn) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (! is_syn) continue; to->to_flags |= TOF_SCALE; to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; case TCPOPT_CC: if (optlen != TCPOLEN_CC) continue; to->to_flags |= TOF_CC; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCNEW: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCNEW; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCECHO: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCECHO; bcopy((char *)cp + 2, (char *)&to->to_ccecho, sizeof(to->to_ccecho)); to->to_ccecho = ntohl(to->to_ccecho); break; #ifdef TCP_SIGNATURE /* * XXX In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have to * record the fact that the option was observed here * for the syncache code to perform the correct response. */ case TCPOPT_SIGNATURE: if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN); break; #endif case TCPOPT_SACK_PERMITTED: if (!tcp_do_sack || optlen != TCPOLEN_SACK_PERMITTED) continue; if (is_syn) { /* MUST only be set on SYN */ to->to_flags |= TOF_SACK; } break; case TCPOPT_SACK: if (!tp || tcp_sack_option(tp, th, cp, optlen)) continue; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. 
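 * Concretely, when SO_OOBINLINE is not set the single byte at the urgent
 * mark is copied into tp->t_iobc, TCPOOB_HAVEDATA is set so a later
 * recv(..., MSG_OOB) can return it, and the byte is spliced out of the
 * mbuf chain so the in-band data stream never sees it.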
*/ static void tcp_pulloutofband(so, th, m, off) struct socket *so; struct tcphdr *th; register struct mbuf *m; int off; /* delayed to be droped hdrlen */ { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == 0) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(tp, rtt) register struct tcpcb *tp; int rtt; { register int delta; tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs. 
If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. * * NOTE that this routine is only called when we process an incoming * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). */ void tcp_mss(tp, offer) struct tcpcb *tp; int offer; { int rtt, mss; u_long bufsize; u_long maxmtu; struct inpcb *inp = tp->t_inpcb; struct socket *so; struct hc_metrics_lite metrics; struct rmxp_tao tao; int origoffer = offer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif bzero(&tao, sizeof(tao)); /* initialize */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc); tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; } else #endif { maxmtu = tcp_maxmtu(&inp->inp_inc); tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; } so = inp->inp_socket; /* * no route to sender, stay with default mss and return */ if (maxmtu == 0) return; /* what have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt. */ offer = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif tcp_mssdflt; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet, * use cached value in that case; */ if (tcp_do_rfc1644) tcp_hc_gettao(&inp->inp_inc, &tao); if (tao.tao_mssopt != 0) offer = tao.tao_mssopt; /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, tcp_minmss); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. */ offer = max(offer, 64); if (tcp_do_rfc1644) tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_MSSOPT, 0, offer); } /* * rmx information is now retrieved from tcp_hostcache */ tcp_hc_get(&inp->inp_inc, &metrics); /* * if there's a discovered mtu int tcp hostcache, use it * else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else #endif { mss = maxmtu - min_protoh; if (!path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } } mss = min(mss, offer); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. 
We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * In case of T/TCP, origoffer==-1 indicates, that no segments * were received yet. In this case we just guess, otherwise * we do the same as before T/TCP. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (origoffer == -1 || (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) mss -= TCPOLEN_CC_APPA; tp->t_maxseg = mss; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif tp->t_maxseg = mss; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); tp->t_maxseg = mss; SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* * While we're here, check the others too */ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; tcpstat.tcps_usedrtt++; if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshhold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this * is a local network or not. * * Extend this so we cache the cwnd too and retrieve it here. * Make cwnd even bigger than RFC3390 suggests but only if we * have previous experience with the remote host. Be careful * not make cwnd bigger than remote receive window or our own * send socket buffer. Maybe put some additional upper bound * on the retrieved cwnd. Should do incremental updates to * hostcache when cwnd collapses so next connection doesn't * overloads the path again. * * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. * We currently check only in syncache_socket for that. 
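/*
 * Minimal sketch (illustrative only) of the socket-buffer sizing just
 * above: grow the buffer to a whole number of segments, capped by the
 * global sb_max.  The sizes in main() are example values.
 */
#include <stdio.h>

static unsigned long
size_sockbuf(unsigned long bufsize, unsigned long mss, unsigned long sb_max)
{
	if (bufsize < mss)
		return (bufsize);	/* caller shrinks mss instead */
	bufsize = ((bufsize + mss - 1) / mss) * mss;	/* roundup() */
	if (bufsize > sb_max)
		bufsize = sb_max;
	return (bufsize);
}

int
main(void)
{
	printf("%lu\n", size_sockbuf(32768, 1460, 262144));	/* 33580 */
	return (0);
}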
*/ #define TCP_METRICS_CWND #ifdef TCP_METRICS_CWND if (metrics.rmx_cwnd) tp->snd_cwnd = max(mss, min(metrics.rmx_cwnd / 2, min(tp->snd_wnd, so->so_snd.sb_hiwat))); else #endif if (tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); #ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) #else else if (in_localaddr(inp->inp_faddr)) #endif tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(inc) struct in_conninfo *inc; { int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; size_t min_protoh; #ifdef INET6 int isipv6 = inc->inc_isipv6 ? 1 : 0; #endif KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (isipv6) { mss = tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } else #endif { mss = tcp_mssdflt; maxmtu = tcp_maxmtu(inc); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); } if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(tp, th) struct tcpcb *tp; struct tcphdr *th; { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); } /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ static int tcp_timewait(tw, to, th, m, tlen) struct tcptw *tw; struct tcpopt *to; struct tcphdr *th; struct mbuf *m; int tlen; { int thflags; tcp_seq seq; #ifdef INET6 int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #else const int isipv6 = 0; #endif thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; /* * If segment contains a SYN and CC [not CC.NEW] option: * if connection duration > MSL, drop packet and send RST; * * if SEG.CC > CCrecv then is new SYN. * Complete close and delete TCPCB. Then reprocess * segment, hoping to find new TCPCB in LISTEN state; * * else must be old SYN; drop it. * else do normal processing. */ if ((thflags & TH_SYN) && (to->to_flags & TOF_CC) && tw->cc_recv != 0) { if ((ticks - tw->t_starttime) > tcp_msl) goto reset; if (CC_GT(to->to_cc, tw->cc_recv)) { (void) tcp_twclose(tw, 0); return (1); } goto drop; } #if 0 /* PAWS not needed at the moment */ /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. 
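/*
 * Illustrative helper, not kernel code: the RFC 3390 initial-window
 * choice used above, next to the pre-RFC3390 behaviour.  The fallback
 * flight sizes (1 segment remote, 4 segments local) are assumed defaults
 * for ss_fltsz/ss_fltsz_local, used here only for the example.
 */
#include <stdio.h>

#define IW_MIN(a, b)	((a) < (b) ? (a) : (b))
#define IW_MAX(a, b)	((a) > (b) ? (a) : (b))

static unsigned long
initial_cwnd(unsigned long mss, int rfc3390, int local)
{
	if (rfc3390)
		/* min(4*MSS, max(2*MSS, 4380 bytes)), per RFC 3390 */
		return (IW_MIN(4 * mss, IW_MAX(2 * mss, 4380UL)));
	return (mss * (local ? 4 : 1));
}

int
main(void)
{
	printf("mss 1460 -> iw %lu\n", initial_cwnd(1460, 1, 0));	/* 4380 */
	printf("mss  536 -> iw %lu\n", initial_cwnd(536, 1, 0));	/* 2144 */
	return (0);
}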
*/ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { if ((thflags & TH_ACK) == 0) goto drop; goto ack; } /* * ts_recent is never updated because we never accept new segments. */ #endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { (void) tcp_twclose(tw, 0); return (1); } /* * Drop the the segment if it does not contain an ACK. */ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_timer_2msl_reset(tw, 2 * tcp_msl); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); goto drop; reset: /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { struct ip6_hdr *ip6; /* IPv6 anycast check is done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { struct ip *ip; ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } if (thflags & TH_ACK) { tcp_respond(NULL, mtod(m, void *), th, m, 0, th->th_ack, TH_RST); } else { seq = th->th_seq + (thflags & TH_SYN ? 1 : 0); tcp_respond(NULL, mtod(m, void *), th, m, seq, 0, TH_RST|TH_ACK); } INP_UNLOCK(tw->tw_inpcb); return (0); drop: INP_UNLOCK(tw->tw_inpcb); m_freem(m); return (0); } Index: head/sys/netinet/tcp_reass.c =================================================================== --- head/sys/netinet/tcp_reass.c (revision 131150) +++ head/sys/netinet/tcp_reass.c (revision 131151) @@ -1,3315 +1,3320 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" #include "opt_tcp_sack.h" #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include #include /* for ICMP_BANDLIM */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef FAST_IPSEC #include #include #endif /*FAST_IPSEC*/ #ifdef IPSEC #include #include #include #endif /*IPSEC*/ #include static const int tcprexmtthresh = 3; tcp_cc tcp_ccgen; struct tcpstat tcpstat; SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); static int log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming TCP connections"); static int blackhole = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send RST when dropping refused connections"); int tcp_delack_enabled = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); #ifdef TCP_DROP_SYNFIN static int drop_synfin = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif static int tcp_do_rfc3042 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); static int tcp_do_rfc3390 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, &tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); static int tcp_reass_maxseg = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, &tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); int tcp_reass_qsize = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, &tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); static int tcp_reass_maxqlen = 48; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW, &tcp_reass_maxqlen, 0, "Maximum number of TCP Segments per individual Reassembly Queue"); static int tcp_reass_overflows = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, &tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; static void 
tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *, int, int, struct tcphdr *); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); static int tcp_timewait(struct tcptw *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window and * - delayed acks are enabled or * - this is a half-synchronized T/TCP connection. */ #define DELAY_ACK(tp) \ ((!callout_active(tp->tt_delack) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* Initialize TCP reassembly queue */ uma_zone_t tcp_reass_zone; void tcp_reass_init() { tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", &tcp_reass_maxseg); tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); } static int tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; register struct tcphdr *th; int *tlenp; struct mbuf *m; { struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; struct tseg_qent *te = NULL; struct socket *so = tp->t_inpcb->inp_socket; int flags; /* * XXX: tcp_reass() is rather inefficient with its data structures * and should be rewritten (see NetBSD for optimizations). While * doing that it should move to its own file tcp_reass.c. */ /* * Call with th==0 after become established to * force pre-ESTABLISHED data up to user socket. */ if (th == 0) goto present; /* * Limit the number of segments in the reassembly queue to prevent * holding on to too many segments (and thus running out of mbufs). * Make sure to let the missing segment through which caused this * queue. Always keep one global queue entry spare to be able to * process the missing segment. */ if (th->th_seq != tp->rcv_nxt && (tcp_reass_qsize + 1 >= tcp_reass_maxseg || tp->t_segqlen >= tcp_reass_maxqlen)) { tcp_reass_overflows++; tcpstat.tcps_rcvmemdrop++; m_freem(m); return (0); } /* * Allocate a new queue entry. If we can't, or hit the zone limit * just drop the pkt. */ te = uma_zalloc(tcp_reass_zone, M_NOWAIT); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); return (0); } tp->t_segqlen++; tcp_reass_qsize++; /* * Find a segment which begins after this one does. */ LIST_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; } /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. 
*/ if (p != NULL) { register int i; /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); uma_zfree(tcp_reass_zone, te); tp->t_segqlen--; tcp_reass_qsize--; /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS * completes. */ goto present; /* ??? */ } m_adj(m, i); *tlenp -= i; th->th_seq += i; } } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; if (i < q->tqe_len) { q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); break; } nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; q = nq; } /* Insert the new segment queue entry into place. */ te->tqe_m = m; te->tqe_th = th; te->tqe_len = *tlenp; if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { LIST_INSERT_AFTER(p, te, tqe_q); } present: /* * Present data to user, advancing rcv_nxt through * completed sequence space. */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); + SOCKBUF_LOCK(&so->so_rcv); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); + /* Unlocked read. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(q->tqe_m); else - sbappendstream(&so->so_rcv, q->tqe_m); + sbappendstream_locked(&so->so_rcv, q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); - sorwakeup(so); + sorwakeup_locked(so); return (flags); } /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ #ifdef INET6 int tcp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { register struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(m, off0) register struct mbuf *m; int off0; { register struct tcphdr *th; register struct ip *ip = NULL; register struct ipovly *ipov; register struct inpcb *inp = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; register struct tcpcb *tp = 0; register int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ struct rmxp_tao tao; /* our TAO cache entry */ int headlocked = 0; struct sockaddr_in *next_hop = NULL; int rstreason; /* For badport_bandlim accounting purposes */ struct ip6_hdr *ip6 = NULL; #ifdef INET6 int isipv6; #else const int isipv6 = 0; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. 
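/*
 * Illustrative sketch of the overlap arithmetic used by tcp_reass()
 * above.  Sequence numbers wrap, so comparisons are made on the signed
 * 32-bit difference, exactly like the kernel's SEQ_GT()/SEQ_LT() macros.
 * The segment structure and names are invented for the example.
 */
#include <stdio.h>
#include <stdint.h>

#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

struct seg {
	uint32_t seq;	/* first sequence number carried */
	int	 len;	/* payload bytes */
};

/*
 * How many leading bytes of "in" are already covered by "prev"?
 * 0 means no overlap, -1 means "in" is a complete duplicate.
 */
static int
trim_against_prev(const struct seg *prev, const struct seg *in)
{
	int i;

	if (!SEQ_GT(prev->seq + prev->len, in->seq))
		return (0);
	i = (int)(prev->seq + prev->len - in->seq);
	if (i >= in->len)
		return (-1);
	return (i);
}

int
main(void)
{
	struct seg prev = { 1000, 500 };	/* covers 1000..1499 */
	struct seg a = { 1400, 300 };		/* overlaps prev by 100 */
	struct seg b = { 1100, 200 };		/* entirely inside prev */

	printf("%d %d\n", trim_against_prev(&prev, &a),
	    trim_against_prev(&prev, &b));	/* prints "100 -1" */
	return (0);
}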
*/ u_char tcp_saveipgen[40]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ next_hop = m_claim_next(m, PACKET_TAG_IPFORWARD); #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif bzero(&tao, sizeof(tao)); bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; if (isipv6) { #ifdef INET6 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } #else th = NULL; /* XXX: avoid compiler warning */ #endif } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; #ifdef TCPDEBUG ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); #endif } else { /* * Checksum extended TCP header and data. */ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } #ifdef INET6 /* Re-initialization for later version check */ ip->ip_v = IPVERSION; #endif } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { #ifdef INET6 IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); #endif } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; #ifdef TCP_DROP_SYNFIN /* * If the drop_synfin option is enabled, drop all packets with * both the SYN and FIN bits set. This prevents e.g. nmap from * identifying the TCP/IP stack. * * This is a violation of the TCP specification. */ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) goto drop; #endif /* * Convert TCP protocol specific fields to host format. 
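/*
 * For reference, a minimal RFC 1071-style one's-complement sum of the
 * kind the in_cksum()/in_pseudo() calls above are built on.  This is an
 * illustrative userland helper, not the kernel routine: a real TCP
 * checksum also covers the pseudo-header (addresses, protocol, length),
 * which is what the CSUM_PSEUDO_HDR handling above accounts for.
 */
#include <stdio.h>
#include <stdint.h>

static uint16_t
cksum_1071(const void *data, int len)
{
	const uint16_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {		/* sum 16-bit words */
		sum += *p++;
		len -= 2;
	}
	if (len == 1)			/* odd trailing byte */
		sum += *(const uint8_t *)p;
	while (sum >> 16)		/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)(~sum & 0xffff));
}

int
main(void)
{
	uint16_t words[2] = { 0x1234, 0xabcd };

	printf("0x%04x\n", cksum_1071(words, (int)sizeof(words)));
	return (0);
}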
*/ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options, * until after ip6_savecontrol() is called and before other functions * which don't want those proto headers. * Because ip6_savecontrol() is going to parse the mbuf to * search for data to be passed up to user-land, it wants mbuf * parameters to be unchanged. * XXX: the call of ip6_savecontrol() has been obsoleted based on * latest version of the advanced API (20020110). */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&tcbinfo); headlocked = 1; findpcb: /* IPFIREWALL_FORWARD section */ if (next_hop != NULL && isipv6 == 0) { /* IPv6 support is not yet */ /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try find the ambushing socket. */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, 1, m->m_pkthdr.rcvif); } } else { if (isipv6) { #ifdef INET6 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, 1, m->m_pkthdr.rcvif); #endif } else inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); } #if defined(IPSEC) || defined(FAST_IPSEC) #ifdef INET6 if (isipv6) { if (inp != NULL && ipsec6_in_reject(m, inp)) { #ifdef IPSEC ipsec6stat.in_polvio++; #endif goto drop; } } else #endif /* INET6 */ if (inp != NULL && ipsec4_in_reject(m, inp)) { #ifdef IPSEC ipsecstat.in_polvio++; #endif goto drop; } #endif /*IPSEC || FAST_IPSEC*/ /* * If the state is CLOSED (i.e., TCB does not exist) then * all data in the incoming segment is discarded. * If the TCB exists but is in CLOSED state, it is embryonic, * but should either do a listen or a connect soon. */ if (inp == NULL) { if (log_in_vain) { #ifdef INET6 char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2]; #else char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; #endif if (isipv6) { #ifdef INET6 strcpy(dbuf, "["); strcpy(sbuf, "["); strcat(dbuf, ip6_sprintf(&ip6->ip6_dst)); strcat(sbuf, ip6_sprintf(&ip6->ip6_src)); strcat(dbuf, "]"); strcat(sbuf, "]"); #endif } else { strcpy(dbuf, inet_ntoa(ip->ip_dst)); strcpy(sbuf, inet_ntoa(ip->ip_src)); } switch (log_in_vain) { case 1: if ((thflags & TH_SYN) == 0) break; /* FALLTHROUGH */ case 2: log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d flags:0x%02x\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport), thflags); break; default: break; } } if (blackhole) { switch (blackhole) { case 1: if (thflags & TH_SYN) goto drop; break; case 2: goto drop; default: goto drop; } } rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK(inp); if (inp->inp_vflag & INP_TIMEWAIT) { /* * The only option of relevance is TOF_CC, and only if * present in a SYN segment. See tcp_timewait(). */ if (thflags & TH_SYN) tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th); if (tcp_timewait((struct tcptw *)inp->inp_ppcb, &to, th, m, tlen)) goto findpcb; /* * tcp_timewait unlocks inp. */ INP_INFO_WUNLOCK(&tcbinfo); return; } tp = intotcpcb(inp); if (tp == 0) { INP_UNLOCK(inp); rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if (tp->t_state == TCPS_CLOSED) goto drop; /* Unscale the window into a 32-bit value. 
*/ if ((thflags & TH_SYN) == 0) tiwin = th->th_win << tp->snd_scale; else tiwin = th->th_win; #ifdef MAC INP_LOCK_ASSERT(inp); if (mac_check_inpcb_deliver(inp, m)) goto drop; #endif so = inp->inp_socket; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; #ifdef INET6 inc.inc_isipv6 = isipv6; #endif if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * If the state is LISTEN then ignore segment if it contains * a RST. If the segment contains an ACK then it is bad and * send a RST. If it does not contain a SYN then it is not * interesting; drop it. * * If the state is SYN_RECEIVED (syncache) and seg contains * an ACK, but not for our SYN/ACK, send a RST. If the seg * contains a RST, check the sequence number to see if it * is a valid reset segment. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { if (!syncache_expand(&inc, th, &so, m)) { /* * No syncache entry, or ACK was not * for our SYN/ACK. Send a RST. */ tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * Could not complete 3-way handshake, * connection is being closed down, and * syncache will free mbuf. */ INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Socket is created in state SYN_RECEIVED. * Continue processing segment. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); /* * This is what would have happened in * tcp_output() when the SYN,ACK was sent. */ tp->snd_up = tp->snd_una; tp->snd_max = tp->snd_nxt = tp->iss + 1; tp->last_ack_sent = tp->rcv_nxt; /* * RFC1323: The window in SYN & SYN/ACK * segments is never scaled. */ tp->snd_wnd = tiwin; /* unscaled */ goto after_listen; } if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto drop; } if (thflags & TH_ACK) { syncache_badack(&inc); tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } goto drop; } /* * Segment's flags are (SYN) or (SYN|FIN). */ #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. 
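/*
 * Aside (illustrative, not kernel code) on the window unscaling at the
 * top of this block: the window field in a TCP header is 16 bits; once
 * scaling has been negotiated each side shifts the peer's value left by
 * the agreed factor, except on SYN segments, where RFC 1323 says the
 * field is never scaled.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t
unscale_window(uint16_t th_win, int snd_scale, int is_syn)
{
	return (is_syn ? th_win : (uint32_t)th_win << snd_scale);
}

int
main(void)
{
	printf("%u\n", (unsigned)unscale_window(65535, 3, 0));	/* 524280 */
	printf("%u\n", (unsigned)unscale_window(65535, 3, 1));	/* 65535 on a SYN */
	return (0);
}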
*/ if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { INP_UNLOCK(inp); tp = NULL; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * If it is from this socket, drop it, it must be forged. * Don't bother responding if the destination was a broadcast. */ if (th->th_dport == th->th_sport) { if (isipv6) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) goto drop; } else { if (ip->ip_dst.s_addr == ip->ip_src.s_addr) goto drop; } } /* * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN * * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* * SYN appears to be valid; create compressed TCP state * for syncache, or perform t/tcp connection. */ if (so->so_qlen <= so->so_qlimit) { #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(tp, &to, optp, optlen, 1, th); if (!syncache_add(&inc, &to, th, &so, m)) goto drop; if (so == NULL) { /* * Entry added to syncache, mbuf used to * send SYN,ACK packet. */ KASSERT(headlocked, ("headlocked")); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Segment passed TAO tests. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); tp->snd_wnd = tiwin; tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; /* * T/TCP logic: * If there is a FIN or if there is data, then * delay SYN,ACK(SYN) in the hope of piggy-backing * it on a response segment. Otherwise must send * ACK now in case the other side is slow starting. */ if (thflags & TH_FIN || tlen != 0) tp->t_flags |= (TF_DELACK | TF_NEEDSYN); else tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcpstat.tcps_connects++; soisconnected(so); goto trimthenstep6; } goto drop; } after_listen: /* XXX temp debugging */ /* should not happen - syncache should pick up these connections */ if (tp->t_state == TCPS_LISTEN) panic("tcp_input: TCPS_LISTEN"); /* * This is the second part of the MSS DoS prevention code (after * minmss on the sending side) and it deals with too many too small * tcp packets in a too short timeframe (1 second). * * For every full second we count the number of received packets * and bytes. If we get a lot of packets per second for this connection * (tcp_minmssoverload) we take a closer look at it and compute the * average packet size for the past second. If that is less than * tcp_minmss we get too many packets with very small payload which * is not good and burdens our system (and every packet generates * a wakeup to the process connected to our socket). We can reasonable * expect this to be small packet DoS attack to exhaust our CPU * cycles. * * Care has to be taken for the minimum packet overload value. This * value defines the minimum number of packets per second before we * start to worry. This must not be too low to avoid killing for * example interactive connections with many small packets like * telnet or SSH. * * Setting either tcp_minmssoverload or tcp_minmss to "0" disables * this check. 
* * Account for packet if payload packet, skip over ACK, etc. */ if (tcp_minmss && tcp_minmssoverload && tp->t_state == TCPS_ESTABLISHED && tlen > 0) { if (tp->rcv_second > ticks) { tp->rcv_pps++; tp->rcv_byps += tlen + off; if (tp->rcv_pps > tcp_minmssoverload) { if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) { printf("too many small tcp packets from " "%s:%u, av. %lubyte/packet, " "dropping connection\n", #ifdef INET6 isipv6 ? ip6_sprintf(&inp->inp_inc.inc6_faddr) : #endif inet_ntoa(inp->inp_inc.inc_faddr), inp->inp_inc.inc_fport, tp->rcv_byps / tp->rcv_pps); tp = tcp_drop(tp, ECONNRESET); tcpstat.tcps_minmssdrops++; goto drop; } } } else { tp->rcv_second = ticks + hz; tp->rcv_pps = 1; tp->rcv_byps = tlen + off; } } /* * Segment received on connection. * Reset idle time and keep-alive timer. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * XXX this is traditional behavior, may need to be cleaned up. */ tcp_dooptions(tp, &to, optp, optlen, thflags & TH_SYN, th); if (thflags & TH_SYN) { if (to.to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; tp->requested_s_scale = to.to_requested_s_scale; } if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & (TOF_CC|TOF_CCNEW)) tp->t_flags |= TF_RCVD_CC; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if (tp->sack_enable) { if (!(to.to_flags & TOF_SACK)) tp->sack_enable = 0; else tp->t_flags |= TF_SACK_PERMIT; } } if (tp->sack_enable) { /* Delete stale (cumulatively acked) SACK holes */ tcp_del_sackholes(tp, th); tp->rcv_laststart = th->th_seq; /* last recv'd segment*/ tp->rcv_lastend = th->th_seq + tlen; } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED above, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && /* * Using the CC option is compulsory if once started: * the segment is OK if no T/TCP was negotiated or * if the segment has a CC option equal to CCrecv */ ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). 
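/*
 * Sketch of the small-segment flood check above, as a standalone
 * function: count packets and bytes per one-second window and look at
 * the average payload once the packet rate passes the overload
 * threshold.  The thresholds and tick rate in main() are example values,
 * not the tcp_minmss/tcp_minmssoverload defaults.
 */
#include <stdio.h>

struct pps_state {
	long end_tick;	/* when the current one-second window closes */
	long pps;	/* packets seen in the window */
	long byps;	/* bytes seen in the window */
};

/* Returns 1 when the connection looks like a small-packet flood. */
static int
small_segment_flood(struct pps_state *st, long now, long hz, long bytes,
    long overload, long minmss)
{
	if (now > st->end_tick) {		/* start a new window */
		st->end_tick = now + hz;
		st->pps = 1;
		st->byps = bytes;
		return (0);
	}
	st->pps++;
	st->byps += bytes;
	return (st->pps > overload && st->byps / st->pps < minmss);
}

int
main(void)
{
	struct pps_state st = { 0, 0, 0 };
	int dropped = 0;

	/* 2000 forty-byte segments arriving well inside one second. */
	for (int i = 0; i < 2000 && !dropped; i++)
		dropped = small_segment_flood(&st, 100 + i / 10, 1000, 40,
		    1000, 216);
	printf("dropped: %d\n", dropped);
	return (0);
}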
*/ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && ((!tcp_do_newreno && !tp->sack_enable && tp->t_dupacks < tcprexmtthresh) || ((tcp_do_newreno || tp->sack_enable) && !IN_FASTRECOVERY(tp)))) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); /* * this is a pure ack for outstanding data. */ ++tcpstat.tcps_predack; /* * "bad retransmit" recovery */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; /* * pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* some progress has been done */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif */ if (tp->snd_una == tp->snd_max) callout_stop(tp->tt_rexmt); else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && tlen <= sbspace(&so->so_rcv)) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ /* Clean receiver SACK report if present */ if (tp->sack_enable && tp->rcv_numsacks) tcp_clean_sackreport(tp); ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* some progress has been done */ /* #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif * Add data to socket buffer. */ + /* Unlocked read. 
*/ + SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { m_adj(m, drop_hdrlen); /* delayed header drop */ - sbappendstream(&so->so_rcv, m); + sbappendstream_locked(&so->so_rcv, m); } - sorwakeup(so); + sorwakeup_locked(so); if (DELAY_ACK(tp)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ { int win; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if (tcp_do_rfc1644) tcp_hc_gettao(&inp->inp_inc, &tao); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { /* * If we have a cached CCsent for the remote host, * hence we haven't just crashed and restarted, * do not send a RST. This may be a retransmission * from the other side after our earlier ACK was lost. * Our new SYN, when it arrives, will serve as the * needed ACK. */ if (tao.tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } if (thflags & TH_RST) { if (thflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } if ((thflags & TH_SYN) == 0) goto drop; tp->snd_wnd = th->th_win; /* initial send window */ tp->cc_recv = to.to_cc; /* foreign CC */ tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { /* * Our SYN was acked. If segment contains CC.ECHO * option, check it to make sure this segment really * matches our SYN. If not, just drop it as old * duplicate, but send an RST if we're still playing * by the old rules. If no CC.ECHO option, make sure * we don't get fooled into using T/TCP. */ if (to.to_flags & TOF_CCECHO) { if (tp->cc_send != to.to_ccecho) { if (tao.tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } } else tp->t_flags &= ~TF_RCVD_CC; tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC SOCK_LOCK(so); mac_set_socket_peer_from_mbuf(m, so); SOCK_UNLOCK(so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* Segment is acceptable, update cache if undefined. */ if (tao.tao_ccsent == 0 && tcp_do_rfc1644) tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0); tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. 
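/*
 * Illustrative helper, not kernel code: the receive-window computation
 * from the block above.  The advertised window is the free space in the
 * receive buffer, but it never shrinks below what has already been
 * advertised (rcv_adv - rcv_nxt), since that would renege on the
 * earlier offer.
 */
#include <stdio.h>
#include <stdint.h>

static long
recv_window(long space, uint32_t rcv_adv, uint32_t rcv_nxt)
{
	long win = space;
	long already = (int32_t)(rcv_adv - rcv_nxt);

	if (win < 0)
		win = 0;
	return (win > already ? win : already);
}

int
main(void)
{
	/* Buffer nearly full, but 4 kB was already promised to the peer. */
	printf("%ld\n", recv_window(512, 14096, 10000));	/* 4096 */
	return (0);
}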
*/ if (DELAY_ACK(tp) && tlen != 0) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= TF_ACKNOW; callout_stop(tp->tt_rexmt); if (to.to_flags & TOF_CC) { if (tao.tao_cc != 0 && CC_GT(to.to_cc, tao.tao_cc)) { /* * update cache and make transition: * SYN-SENT -> ESTABLISHED* * SYN-SENT* -> FIN-WAIT-1* */ tao.tao_cc = to.to_cc; tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC, to.to_cc, 0); tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } tp->t_flags |= TF_NEEDSYN; } else tp->t_state = TCPS_SYN_RECEIVED; } else { if (tcp_do_rfc1644) { /* CC.NEW or no option => invalidate cache */ tao.tao_cc = 0; tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC, to.to_cc, 0); } tp->t_state = TCPS_SYN_RECEIVED; } } trimthenstep6: /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * if segment contains a SYN and CC [not CC.NEW] option: * if state == TIME_WAIT and connection duration > MSL, * drop packet and send RST; * * if SEG.CC > CCrecv then is new SYN, and can implicitly * ack the FIN (and data) in retransmission queue. * Complete close and delete TCPCB. Then reprocess * segment, hoping to find new TCPCB in LISTEN state; * * else must be old SYN; drop it. * else do normal processing. */ case TCPS_LAST_ACK: case TCPS_CLOSING: case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); if ((thflags & TH_SYN) && (to.to_flags & TOF_CC) && tp->cc_recv != 0) { if (tp->t_state == TCPS_TIME_WAIT && (ticks - tp->t_starttime) > tcp_msl) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if (CC_GT(to.to_cc, tp->cc_recv)) { tp = tcp_close(tp); goto findpcb; } else goto drop; } break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 
960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * Note 2: Paul Watson's paper "Slipping in the Window" has shown * that brute force RST attacks are possible. To combat this, * we use a much stricter check while in the ESTABLISHED state, * only accepting RSTs where the sequence number is equal to * last_ack_sent. In all other states (the states in which a * RST is more likely), the more permissive check is used. * If we have multiple segments in flight, the intial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) { if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: if (tp->last_ack_sent != th->th_seq) { tcpstat.tcps_badrst++; goto drop; } case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: tp = tcp_close(tp); break; case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. 
If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; if (tlen) goto dropafterack; goto drop; } } /* * T/TCP mechanism * If T/TCP was negotiated and the segment doesn't have CC, * or if its CC is wrong then drop the segment. * RST segments do not have to comply with this. */ if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) goto dropafterack; /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { tcpstat.tcps_rcvbyteafterwin += tlen; /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); if (thflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(th->th_seq, tp->rcv_nxt)) { tp = tcp_close(tp); goto findpcb; } /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. 
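/*
 * Sketch of the PAWS test above (RFC 1323), illustrative only: a segment
 * whose timestamp is older than the last one accepted is dropped, unless
 * the remembered timestamp has itself gone stale (no update for roughly
 * 24 days), in which case it is simply invalidated.  TSTMP_LT() is the
 * usual wrap-safe compare; the idle limit is passed in as a parameter.
 */
#include <stdio.h>
#include <stdint.h>

#define TSTMP_LT(a, b)	((int32_t)((a) - (b)) < 0)

/* Returns 0 to accept, 1 to drop as old duplicate, 2 if ts_recent reset. */
static int
paws_check(uint32_t tsval, uint32_t *ts_recent, long age_ticks,
    long paws_idle)
{
	if (*ts_recent == 0 || !TSTMP_LT(tsval, *ts_recent))
		return (0);
	if (age_ticks > paws_idle) {
		*ts_recent = 0;		/* too old to trust; invalidate */
		return (2);
	}
	return (1);
}

int
main(void)
{
	uint32_t ts_recent = 5000;

	printf("%d\n", paws_check(4000, &ts_recent, 10, 1000));	/* 1 */
	printf("%d\n", paws_check(4000, &ts_recent, 2000, 1000));	/* 2 */
	return (0);
}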
*/ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto drop; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* * Upon successful completion of 3-way handshake, * update cache.CC, pass any queued data to the user, * and advance state appropriately. */ if (tcp_do_rfc1644) { tao.tao_cc = tp->cc_recv; tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC, tp->cc_recv, 0); } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). 
* * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. */ if (!callout_active(tp->tt_rexmt) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp))) { tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win; if ((tcp_do_newreno || tp->sack_enable) && SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; if (tp->sack_enable) { tcpstat.tcps_sack_recovery_episode++; tp->snd_cwnd = tp->t_maxseg * tp->t_dupacks; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh; goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big")); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("dupacks not 1 or 2")); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("sent too much")); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una")); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (tcp_do_newreno || tp->sack_enable) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->sack_enable) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else { /* * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } tp->t_dupacks = 0; if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? 
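 */

/*
 * Sketch of the fast-retransmit entry arithmetic above (illustrative and
 * simplified; all names are local to the sketch): on the third duplicate
 * ACK, ssthresh becomes half the current flight, rounded down to whole
 * segments and never less than two segments, and cwnd is pulled down so
 * only the presumed-lost segment is retransmitted immediately.
 */
static void
enter_fast_recovery_sketch(unsigned long snd_wnd, unsigned long *snd_cwnd,
    unsigned long *snd_ssthresh, unsigned int maxseg)
{
	unsigned long win;

	win = (snd_wnd < *snd_cwnd ? snd_wnd : *snd_cwnd) / 2 / maxseg;
	if (win < 2)
		win = 2;
	*snd_ssthresh = win * maxseg;
	*snd_cwnd = maxseg;	/* send just the lost segment for now */
}

/*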
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } } process_ACK: acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { callout_stop(tp->tt_rexmt); needoutput = 1; } else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). 
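 */

/*
 * Sketch of the congestion-window growth rule described in the comment
 * above and implemented just below (illustrative; names local to the
 * sketch): below ssthresh each ACK grows cwnd by a full segment (slow
 * start); above it the increase is maxseg*maxseg/cwnd, i.e. roughly one
 * segment per round trip (congestion avoidance).  The clamp stands in for
 * the maximum window, TCP_MAXWIN scaled by the negotiated send shift.
 */
static unsigned long
grow_cwnd_sketch(unsigned long cwnd, unsigned long ssthresh,
    unsigned int maxseg, unsigned long cwnd_clamp)
{
	unsigned long incr = maxseg;

	if (cwnd > ssthresh)
		incr = (unsigned long)maxseg * maxseg / cwnd;
	cwnd += incr;
	return (cwnd < cwnd_clamp ? cwnd : cwnd_clamp);
}

/*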
*/ if ((!tcp_do_newreno && !tp->sack_enable) || !IN_FASTRECOVERY(tp)) { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<snd_scale); } SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } - SOCKBUF_UNLOCK(&so->so_snd); - sowwakeup(so); + sowwakeup_locked(so); /* detect una wraparound */ if ((tcp_do_newreno || tp->sack_enable) && !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; if ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; if (tp->sack_enable) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. */ /* XXXjl * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, tcp_timer_2msl, tp); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { KASSERT(headlocked, ("headlocked")); tcp_twstart(tp); INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; /* * In TIME_WAIT state the only thing that should arrive * is a retransmission of the remote FIN. Acknowledge * it and restart the finack timer. */ case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); goto dropafterack; } } step6: /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. 
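 */

/*
 * Sketch of the step6 send-window update test above (illustrative; names
 * local to the sketch): a new window advertisement is accepted only from a
 * segment that is newer than the one the current window came from -- a
 * newer sequence number, or the same sequence with a newer ACK, or the same
 * of both with a larger window.
 */
static int
should_update_sndwnd_sketch(unsigned int seq, unsigned int ack,
    unsigned long tiwin, unsigned int snd_wl1, unsigned int snd_wl2,
    unsigned long snd_wnd)
{
	if ((int)(snd_wl1 - seq) < 0)			/* SEQ_LT(wl1, seq) */
		return (1);
	if (snd_wl1 == seq) {
		if ((int)(snd_wl2 - ack) < 0)		/* SEQ_LT(wl2, ack) */
			return (1);
		if (snd_wl2 == ack && tiwin > snd_wnd)
			return (1);
	}
	return (0);
}

/*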
*/ if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; SOCKBUF_LOCK(&so->so_rcv); so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; SOCKBUF_UNLOCK(&so->so_rcv); sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (u_long)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("headlocked")); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); + /* Unlocked read. */ + SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else - sbappendstream(&so->so_rcv, m); - sorwakeup(so); + sbappendstream_locked(&so->so_rcv, m); + sorwakeup_locked(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tp->sack_enable) tcp_update_sack_list(tp); /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. 
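 */

/*
 * Sketch of the receive-side FIN transitions handled just below
 * (illustrative; the enum is local to the sketch, not the kernel's state
 * constants): the peer's FIN consumes one sequence number and moves the
 * connection toward the passive-close states.
 */
enum fin_state_sketch {
	S_ESTABLISHED, S_FIN_WAIT_1, S_FIN_WAIT_2,
	S_CLOSE_WAIT, S_CLOSING, S_TIME_WAIT
};

static enum fin_state_sketch
fin_transition_sketch(enum fin_state_sketch s)
{
	switch (s) {
	case S_ESTABLISHED:
		return (S_CLOSE_WAIT);	/* peer closed first */
	case S_FIN_WAIT_1:
		return (S_CLOSING);	/* simultaneous close, our FIN unacked */
	case S_FIN_WAIT_2:
		return (S_TIME_WAIT);	/* normal active close completes */
	default:
		return (s);		/* TIME_WAIT only restarts its timer */
	}
}

/*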
*/ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /*FALLTHROUGH*/ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: KASSERT(headlocked == 1, ("headlocked should be 1")); tcp_twstart(tp); INP_INFO_WUNLOCK(&tcbinfo); return; /* * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); break; } } INP_INFO_WUNLOCK(&tcbinfo); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); check_delack: if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); } INP_UNLOCK(inp); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif KASSERT(headlocked, ("headlocked should be 1")); INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_UNLOCK(inp); return; dropwithreset: /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* IPv6 anycast check is done at tcp6_input() */ /* * Perform bandwidth limiting. 
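 */

/*
 * Sketch of the dropafterack guard above (illustrative; names local to the
 * sketch): while in SYN-RECEIVED, an incoming ACK is answered with an ACK
 * only if it falls between snd_una and snd_max inclusive; anything outside
 * that range earns a RST, which is what breaks the LAND loop and the
 * forged-SYN ACK storm described in the comment.
 */
static int
synrcvd_ack_acceptable_sketch(unsigned int th_ack, unsigned int snd_una,
    unsigned int snd_max)
{
	if ((int)(snd_una - th_ack) > 0)	/* SEQ_GT(snd_una, th_ack) */
		return (0);
	if ((int)(th_ack - snd_max) > 0)	/* SEQ_GT(th_ack, snd_max) */
		return (0);
	return (1);
}

/*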
*/ if (badport_bandlim(rstreason) < 0) goto drop; #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } if (tp) INP_UNLOCK(inp); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp) INP_UNLOCK(inp); m_freem(m); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Parse TCP options and place in tcpopt. */ static void tcp_dooptions(tp, to, cp, cnt, is_syn, th) struct tcpcb *tp; struct tcpopt *to; u_char *cp; int cnt; int is_syn; struct tcphdr *th; { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!is_syn) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (! is_syn) continue; to->to_flags |= TOF_SCALE; to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; case TCPOPT_CC: if (optlen != TCPOLEN_CC) continue; to->to_flags |= TOF_CC; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCNEW: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCNEW; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCECHO: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCECHO; bcopy((char *)cp + 2, (char *)&to->to_ccecho, sizeof(to->to_ccecho)); to->to_ccecho = ntohl(to->to_ccecho); break; #ifdef TCP_SIGNATURE /* * XXX In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have to * record the fact that the option was observed here * for the syncache code to perform the correct response. */ case TCPOPT_SIGNATURE: if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN); break; #endif case TCPOPT_SACK_PERMITTED: if (!tcp_do_sack || optlen != TCPOLEN_SACK_PERMITTED) continue; if (is_syn) { /* MUST only be set on SYN */ to->to_flags |= TOF_SACK; } break; case TCPOPT_SACK: if (!tp || tcp_sack_option(tp, th, cp, optlen)) continue; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. 
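 */

/*
 * Sketch of the option walk performed by tcp_dooptions() above
 * (illustrative, heavily simplified; names local to the sketch): EOL stops
 * parsing, NOP occupies a single byte, and every other option carries a
 * length byte that must be at least 2 and must fit inside the remaining
 * option space, otherwise parsing stops.
 */
static int
count_tcp_options_sketch(const unsigned char *cp, int cnt)
{
	int opt, optlen, nopts = 0;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == 0)			/* TCPOPT_EOL */
			break;
		if (opt == 1)			/* TCPOPT_NOP */
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;		/* malformed; stop parsing */
		}
		nopts++;
	}
	return (nopts);
}

/*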
*/ static void tcp_pulloutofband(so, th, m, off) struct socket *so; struct tcphdr *th; register struct mbuf *m; int off; /* delayed to be droped hdrlen */ { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == 0) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(tp, rtt) register struct tcpcb *tp; int rtt; { register int delta; tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs. 
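 */

/*
 * Sketch of the smoothing that the fixed-point tcp_xmit_timer() code above
 * performs, written with plain arithmetic for clarity (illustrative; the
 * kernel keeps srtt and rttvar pre-scaled to avoid these divisions): srtt
 * is an EWMA with gain 1/8, rttvar an EWMA of the absolute error with gain
 * 1/4, and the retransmit timeout is srtt + 4*rttvar, clamped to sane
 * bounds by the caller.
 */
static void
xmit_timer_sketch(int rtt, int *srtt, int *rttvar, int *rto)
{
	int delta;

	if (*srtt == 0) {
		/* First sample: seed the average and the variance. */
		*srtt = rtt;
		*rttvar = rtt / 2;
	} else {
		delta = rtt - *srtt;
		*srtt += delta / 8;			/* alpha = 7/8 */
		if (delta < 0)
			delta = -delta;
		*rttvar += (delta - *rttvar) / 4;	/* beta = 3/4 */
	}
	*rto = *srtt + 4 * *rttvar;
}

/*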
If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. * * NOTE that this routine is only called when we process an incoming * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). */ void tcp_mss(tp, offer) struct tcpcb *tp; int offer; { int rtt, mss; u_long bufsize; u_long maxmtu; struct inpcb *inp = tp->t_inpcb; struct socket *so; struct hc_metrics_lite metrics; struct rmxp_tao tao; int origoffer = offer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif bzero(&tao, sizeof(tao)); /* initialize */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc); tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; } else #endif { maxmtu = tcp_maxmtu(&inp->inp_inc); tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; } so = inp->inp_socket; /* * no route to sender, stay with default mss and return */ if (maxmtu == 0) return; /* what have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt. */ offer = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif tcp_mssdflt; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet, * use cached value in that case; */ if (tcp_do_rfc1644) tcp_hc_gettao(&inp->inp_inc, &tao); if (tao.tao_mssopt != 0) offer = tao.tao_mssopt; /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, tcp_minmss); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. */ offer = max(offer, 64); if (tcp_do_rfc1644) tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_MSSOPT, 0, offer); } /* * rmx information is now retrieved from tcp_hostcache */ tcp_hc_get(&inp->inp_inc, &metrics); /* * if there's a discovered mtu int tcp hostcache, use it * else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else #endif { mss = maxmtu - min_protoh; if (!path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } } mss = min(mss, offer); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. 
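 */

/*
 * Sketch of the MSS selection above (illustrative; the minimum MSS and the
 * protocol header size are parameters here rather than the sysctl and the
 * min_protoh local): the segment size is bounded both by what the path can
 * carry and by what the peer offered, and an unreasonably small offer is
 * raised to a floor to blunt the tiny-MSS DoS mentioned above.
 */
static unsigned long
select_mss_sketch(unsigned long path_mtu, unsigned long min_protoh,
    unsigned long offer, unsigned long minmss)
{
	unsigned long mss = path_mtu - min_protoh;

	if (offer < minmss)
		offer = minmss;		/* anti-DoS floor */
	if (offer < 64)
		offer = 64;		/* leave room for options */
	return (mss < offer ? mss : offer);
}

/*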
We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * In case of T/TCP, origoffer==-1 indicates, that no segments * were received yet. In this case we just guess, otherwise * we do the same as before T/TCP. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (origoffer == -1 || (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) mss -= TCPOLEN_CC_APPA; tp->t_maxseg = mss; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif tp->t_maxseg = mss; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); tp->t_maxseg = mss; SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* * While we're here, check the others too */ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; tcpstat.tcps_usedrtt++; if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshhold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this * is a local network or not. * * Extend this so we cache the cwnd too and retrieve it here. * Make cwnd even bigger than RFC3390 suggests but only if we * have previous experience with the remote host. Be careful * not make cwnd bigger than remote receive window or our own * send socket buffer. Maybe put some additional upper bound * on the retrieved cwnd. Should do incremental updates to * hostcache when cwnd collapses so next connection doesn't * overloads the path again. * * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. * We currently check only in syncache_socket for that. 
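 */

/*
 * Sketch of the initial congestion-window choice made just below
 * (illustrative; the local/remote flight sizes are parameters here rather
 * than the ss_fltsz sysctls): RFC 3390 allows roughly 4 Kbytes of initial
 * data, expressed as min(4*MSS, max(2*MSS, 4380 bytes)); otherwise the
 * flight size depends on whether the destination is local.
 */
static unsigned long
initial_cwnd_sketch(unsigned long mss, int do_rfc3390, int is_local,
    unsigned long fltsz_local, unsigned long fltsz)
{
	unsigned long iw;

	if (do_rfc3390) {
		iw = 2 * mss;
		if (iw < 4380)
			iw = 4380;		/* max(2*MSS, 4380) */
		if (iw > 4 * mss)
			iw = 4 * mss;		/* ...but never above 4*MSS */
		return (iw);
	}
	return (mss * (is_local ? fltsz_local : fltsz));
}

/*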
*/ #define TCP_METRICS_CWND #ifdef TCP_METRICS_CWND if (metrics.rmx_cwnd) tp->snd_cwnd = max(mss, min(metrics.rmx_cwnd / 2, min(tp->snd_wnd, so->so_snd.sb_hiwat))); else #endif if (tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); #ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) #else else if (in_localaddr(inp->inp_faddr)) #endif tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(inc) struct in_conninfo *inc; { int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; size_t min_protoh; #ifdef INET6 int isipv6 = inc->inc_isipv6 ? 1 : 0; #endif KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (isipv6) { mss = tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } else #endif { mss = tcp_mssdflt; maxmtu = tcp_maxmtu(inc); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); } if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(tp, th) struct tcpcb *tp; struct tcphdr *th; { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); } /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ static int tcp_timewait(tw, to, th, m, tlen) struct tcptw *tw; struct tcpopt *to; struct tcphdr *th; struct mbuf *m; int tlen; { int thflags; tcp_seq seq; #ifdef INET6 int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #else const int isipv6 = 0; #endif thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; /* * If segment contains a SYN and CC [not CC.NEW] option: * if connection duration > MSL, drop packet and send RST; * * if SEG.CC > CCrecv then is new SYN. * Complete close and delete TCPCB. Then reprocess * segment, hoping to find new TCPCB in LISTEN state; * * else must be old SYN; drop it. * else do normal processing. */ if ((thflags & TH_SYN) && (to->to_flags & TOF_CC) && tw->cc_recv != 0) { if ((ticks - tw->t_starttime) > tcp_msl) goto reset; if (CC_GT(to->to_cc, tw->cc_recv)) { (void) tcp_twclose(tw, 0); return (1); } goto drop; } #if 0 /* PAWS not needed at the moment */ /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. 
*/ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { if ((thflags & TH_ACK) == 0) goto drop; goto ack; } /* * ts_recent is never updated because we never accept new segments. */ #endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { (void) tcp_twclose(tw, 0); return (1); } /* * Drop the the segment if it does not contain an ACK. */ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_timer_2msl_reset(tw, 2 * tcp_msl); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); goto drop; reset: /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { struct ip6_hdr *ip6; /* IPv6 anycast check is done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { struct ip *ip; ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } if (thflags & TH_ACK) { tcp_respond(NULL, mtod(m, void *), th, m, 0, th->th_ack, TH_RST); } else { seq = th->th_seq + (thflags & TH_SYN ? 1 : 0); tcp_respond(NULL, mtod(m, void *), th, m, seq, 0, TH_RST|TH_ACK); } INP_UNLOCK(tw->tw_inpcb); return (0); drop: INP_UNLOCK(tw->tw_inpcb); m_freem(m); return (0); } Index: head/sys/netinet/udp_usrreq.c =================================================================== --- head/sys/netinet/udp_usrreq.c (revision 131150) +++ head/sys/netinet/udp_usrreq.c (revision 131151) @@ -1,1125 +1,1130 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 * $FreeBSD$ */ #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #ifdef INET6 #include #endif #include #include #ifdef FAST_IPSEC #include #endif /*FAST_IPSEC*/ #ifdef IPSEC #include #endif /*IPSEC*/ #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ #ifndef COMPAT_42 static int udpcksum = 1; #else static int udpcksum = 0; /* XXX */ #endif SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udpcksum, 0, ""); int log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming UDP packets"); static int blackhole = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send port unreachables for refused connects"); static int strict_mcast_mship = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, strict_mcast_mship, CTLFLAG_RW, &strict_mcast_mship, 0, "Only send multicast to member sockets"); struct inpcbhead udb; /* from udp_var.h */ #define udb6 udb /* for KAME src sync over BSD*'s */ struct inpcbinfo udbinfo; #ifndef UDBHASHSIZE #define UDBHASHSIZE 16 #endif struct udpstat udpstat; /* from udp_var.h */ SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); static struct sockaddr_in udp_in = { sizeof(udp_in), AF_INET }; #ifdef INET6 struct udp_in6 { struct sockaddr_in6 uin6_sin; u_char uin6_init_done : 1; } udp_in6 = { { sizeof(udp_in6.uin6_sin), AF_INET6 }, 0 }; struct udp_ip6 { struct ip6_hdr uip6_ip6; u_char uip6_init_done : 1; } udp_ip6; #endif /* INET6 */ static void udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off); #ifdef INET6 static void ip_2_ip6_hdr(struct ip6_hdr *ip6, struct ip *ip); #endif static int udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); void udp_init() { INP_INFO_LOCK_INIT(&udbinfo, "udp"); LIST_INIT(&udb); udbinfo.listhead = &udb; udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask); udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.porthashmask); udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(udbinfo.ipi_zone, maxsockets); } void udp_input(m, off) register struct mbuf *m; int off; { int iphlen = off; register struct ip *ip; register struct udphdr *uh; register struct inpcb *inp; struct mbuf *opts = 0; int len; struct ip save_ip; udpstat.udps_ipackets++; /* * Strip IP options, if any; should skip this, * make available to user, and use on returned packets, * but we don't yet have a way to check the checksum * with 
options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { udpstat.udps_hdrops++; return; } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); /* destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. * Stuff source address and datagram in user buffer. */ udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; #ifdef INET6 udp_in6.uin6_init_done = udp_ip6.uip6_init_done = 0; #endif /* * Make mbuf data length reflect UDP length. * If not enough data to reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); if (ip->ip_len != len) { if (len > ip->ip_len || len < sizeof(struct udphdr)) { udpstat.udps_badlen++; goto badunlocked; } m_adj(m, len - ip->ip_len); /* ip->ip_len = len; */ } /* * Save a copy of the IP header in case we want restore it * for sending an ICMP error message in response. */ if (!blackhole) save_ip = *ip; /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh->uh_sum = m->m_pkthdr.csum_data; else uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); uh->uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = uh->uh_ulen; uh->uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh->uh_sum) { udpstat.udps_badsum++; m_freem(m); return; } } else udpstat.udps_nosum++; INP_INFO_RLOCK(&udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { struct inpcb *last; /* * Deliver a multicast or broadcast datagram to *all* sockets * for which the local and remote addresses and ports match * those of the incoming datagram. This allows more than * one process to receive multi/broadcasts on the same port. * (This really ought to be done for unicast datagrams as * well, but that would cause problems with existing * applications that open both address-specific sockets and * a wildcard socket listening to the same port -- they would * end up receiving duplicates of every unicast datagram. * Those applications open the multiple sockets to overcome an * inadequacy of the UDP socket interface, but for backwards * compatibility we avoid the problem here rather than * fixing the interface. Maybe 4.5BSD will remedy this?) */ /* * Locate pcb(s) for datagram. * (Algorithm copied from raw_intr().) 
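 */

/*
 * Sketch of the UDP length sanity check above (illustrative; names local to
 * the sketch): the length claimed in the UDP header must cover at least the
 * header itself and may not exceed what IP actually delivered; a shorter
 * claim just means the IP payload carries trailing bytes to trim.
 */
static int
udp_len_ok_sketch(unsigned int udp_len, unsigned int ip_payload_len,
    int *trim)
{
	*trim = 0;
	if (udp_len < 8)			/* sizeof(struct udphdr) */
		return (0);			/* bad length, drop */
	if (udp_len > ip_payload_len)
		return (0);			/* claims more than arrived */
	if (udp_len < ip_payload_len)
		*trim = ip_payload_len - udp_len; /* strip the padding */
	return (1);
}

/*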
*/ last = NULL; LIST_FOREACH(inp, &udb, inp_list) { INP_LOCK(inp); if (inp->inp_lport != uh->uh_dport) { docontinue: INP_UNLOCK(inp); continue; } #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) goto docontinue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY) { if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) goto docontinue; } if (inp->inp_faddr.s_addr != INADDR_ANY) { if (inp->inp_faddr.s_addr != ip->ip_src.s_addr || inp->inp_fport != uh->uh_sport) goto docontinue; } /* * Check multicast packets to make sure they are only * sent to sockets with multicast memberships for the * packet's destination address and arrival interface */ #define MSHIP(_inp, n) ((_inp)->inp_moptions->imo_membership[(n)]) #define NMSHIPS(_inp) ((_inp)->inp_moptions->imo_num_memberships) if (strict_mcast_mship && inp->inp_moptions != NULL) { int mship, foundmship = 0; for (mship = 0; mship < NMSHIPS(inp); mship++) { if (MSHIP(inp, mship)->inm_addr.s_addr == ip->ip_dst.s_addr && MSHIP(inp, mship)->inm_ifp == m->m_pkthdr.rcvif) { foundmship = 1; break; } } if (foundmship == 0) goto docontinue; } #undef NMSHIPS #undef MSHIP if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, M_COPYALL); if (n != NULL) udp_append(last, ip, n, iphlen + sizeof(struct udphdr)); INP_UNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids searching * through all pcbs in the common case of a non-shared * port. It * assumes that an application will never * clear these options after setting them. */ if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. * (No need to send an ICMP Port Unreachable * for a broadcast or multicast datgram.) */ udpstat.udps_noportbcast++; goto badheadlocked; } INP_INFO_RUNLOCK(&udbinfo); udp_append(last, ip, m, iphlen + sizeof(struct udphdr)); INP_UNLOCK(last); return; } /* * Locate pcb for datagram. */ inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); if (inp == NULL) { if (log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } udpstat.udps_noport++; if (m->m_flags & (M_BCAST | M_MCAST)) { udpstat.udps_noportbcast++; goto badheadlocked; } if (blackhole) goto badheadlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badheadlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); INP_INFO_RUNLOCK(&udbinfo); return; } INP_LOCK(inp); INP_INFO_RUNLOCK(&udbinfo); udp_append(inp, ip, m, iphlen + sizeof(struct udphdr)); INP_UNLOCK(inp); return; badheadlocked: INP_INFO_RUNLOCK(&udbinfo); if (inp) INP_UNLOCK(inp); badunlocked: m_freem(m); if (opts) m_freem(opts); return; } #ifdef INET6 static void ip_2_ip6_hdr(ip6, ip) struct ip6_hdr *ip6; struct ip *ip; { bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = ip->ip_len; ip6->ip6_nxt = ip->ip_p; ip6->ip6_hlim = ip->ip_ttl; ip6->ip6_src.s6_addr32[2] = ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_SMP; ip6->ip6_src.s6_addr32[3] = ip->ip_src.s_addr; ip6->ip6_dst.s6_addr32[3] = ip->ip_dst.s_addr; } #endif /* * subroutine of udp_input(), mainly for source code readability. * caller must properly init udp_ip6 and udp_in6 beforehand. 
*/ static void udp_append(last, ip, n, off) struct inpcb *last; struct ip *ip; struct mbuf *n; int off; { struct sockaddr *append_sa; + struct socket *so; struct mbuf *opts = 0; INP_LOCK_ASSERT(last); #if defined(IPSEC) || defined(FAST_IPSEC) /* check AH/ESP integrity. */ if (ipsec4_in_reject(n, last)) { #ifdef IPSEC ipsecstat.in_polvio++; #endif /*IPSEC*/ m_freem(n); return; } #endif /*IPSEC || FAST_IPSEC*/ #ifdef MAC if (mac_check_inpcb_deliver(last, n) != 0) { m_freem(n); return; } #endif if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (last->inp_vflag & INP_IPV6) { int savedflags; if (udp_ip6.uip6_init_done == 0) { ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); udp_ip6.uip6_init_done = 1; } savedflags = last->inp_flags; last->inp_flags &= ~INP_UNMAPPABLEOPTS; ip6_savecontrol(last, n, &opts); last->inp_flags = savedflags; } else #endif ip_savecontrol(last, &opts, ip, n); } #ifdef INET6 if (last->inp_vflag & INP_IPV6) { if (udp_in6.uin6_init_done == 0) { in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); udp_in6.uin6_init_done = 1; } append_sa = (struct sockaddr *)&udp_in6.uin6_sin; } else #endif append_sa = (struct sockaddr *)&udp_in; m_adj(n, off); - if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts) == 0) { + + so = last->inp_socket; + SOCKBUF_LOCK(&so->so_rcv); + if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); udpstat.udps_fullsock++; + SOCKBUF_UNLOCK(&so->so_rcv); } else - sorwakeup(last->inp_socket); + sorwakeup_locked(so); } /* * Notify a udp user of an asynchronous error; * just wake up so that he can collect error status. */ struct inpcb * udp_notify(inp, errno) register struct inpcb *inp; int errno; { inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return inp; } void udp_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct ip *ip = vip; struct udphdr *uh; struct inpcb *(*notify)(struct inpcb *, int) = udp_notify; struct in_addr faddr; struct inpcb *inp; int s; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; /* * Redirects don't need to be handled up here. */ if (PRC_IS_REDIRECT(cmd)) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { s = splnet(); uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&udbinfo); inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if (inp->inp_socket != NULL) { (*notify)(inp, inetctlerrmap[cmd]); } INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&udbinfo); splx(s); } else in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], notify); } static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = udbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. 
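 */

/*
 * Sketch of the locking pattern this change introduces in udp_append()
 * above (illustrative model only; the lock, append and wakeup steps stand
 * in for SOCKBUF_LOCK(), sbappendaddr_locked() and sorwakeup_locked(), and
 * the struct is not a real kernel type): the datagram is appended and the
 * reader woken without dropping the socket-buffer lock in between, and the
 * locked wakeup variant is responsible for releasing that lock.
 */
struct rcvbuf_sketch {
	int	locked;
	int	len;
	int	hiwat;
};

static int
append_and_wakeup_sketch(struct rcvbuf_sketch *sb, int dlen)
{
	sb->locked = 1;			/* SOCKBUF_LOCK() */
	if (sb->len + dlen > sb->hiwat) {
		sb->locked = 0;		/* SOCKBUF_UNLOCK(): full, no wakeup */
		return (0);		/* caller frees the data */
	}
	sb->len += dlen;		/* sbappendaddr_locked() */
	/* sorwakeup_locked(): wakes the reader and drops the lock. */
	sb->locked = 0;
	return (1);
}

/*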
*/ s = splnet(); INP_INFO_RLOCK(&udbinfo); gencnt = udbinfo.ipi_gencnt; n = udbinfo.ipi_count; INP_INFO_RUNLOCK(&udbinfo); splx(s); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); INP_INFO_RLOCK(&udbinfo); for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&udbinfo); splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; error = SYSCTL_OUT(req, &xi, sizeof xi); } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); INP_INFO_RLOCK(&udbinfo); xig.xig_gen = udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = udbinfo.ipi_count; INP_INFO_RUNLOCK(&udbinfo); splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error, s; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); s = splnet(); INP_INFO_RLOCK(&udbinfo); inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); if (inp == NULL || inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_INFO_RUNLOCK(&udbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); static int udp_output(inp, m, addr, control, td) register struct inpcb *inp; struct mbuf *m; struct sockaddr *addr; struct mbuf *control; struct thread *td; { register struct udpiphdr *ui; register int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct sockaddr_in *sin, src; int error = 0; int ipflags; u_short fport, lport; INP_LOCK_ASSERT(inp); #ifdef MAC mac_create_mbuf_from_inpcb(inp, m); #endif if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { error = EMSGSIZE; if (control) m_freem(control); goto release; } src.sin_addr.s_addr = INADDR_ANY; if (control != NULL) { /* * XXX: Currently, we assume all the optional information * is stored in a single mbuf. 
*/ if (control->m_next) { error = EINVAL; m_freem(control); goto release; } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) goto release; laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_addr.s_addr != INADDR_ANY) { if (lport == 0) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } if (addr) { sin = (struct sockaddr_in *)addr; if (td && jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } } else { faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf * for UDP and IP headers. */ M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT); if (m == 0) { error = ENOBUFS; goto release; } /* * Fill in mbuf with extended UDP header * and addresses and length put into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = IPPROTO_UDP; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); ipflags = inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST); if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; /* * Set up checksum and output datagram. 
*/ if (udpcksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } else { ui->ui_sum = 0; } ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ udpstat.udps_opackets++; error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); return (error); release: m_freem(m); return (error); } u_long udp_sendspace = 9216; /* really max datagram size */ /* 40 1K datagrams */ SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); static int udp_abort(struct socket *so) { struct inpcb *inp; int s; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; /* ??? possible? panic instead? */ } INP_LOCK(inp); soisdisconnected(so); s = splnet(); in_pcbdetach(inp); INP_INFO_WUNLOCK(&udbinfo); splx(s); return 0; } static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int s, error; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp != 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } error = soreserve(so, udp_sendspace, udp_recvspace); if (error) { INP_INFO_WUNLOCK(&udbinfo); return error; } s = splnet(); error = in_pcballoc(so, &udbinfo, "udpinp"); splx(s); if (error) { INP_INFO_WUNLOCK(&udbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_LOCK(inp); INP_INFO_WUNLOCK(&udbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = ip_defttl; INP_UNLOCK(inp); return 0; } static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int s, error; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); s = splnet(); error = in_pcbbind(inp, nam, td->td_ucred); splx(s); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return error; } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int s, error; struct sockaddr_in *sin; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return EISCONN; } s = splnet(); sin = (struct sockaddr_in *)nam; if (td && jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); error = in_pcbconnect(inp, nam, td->td_ucred); splx(s); if (error == 0) soisconnected(so); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return error; } static int udp_detach(struct socket *so) { struct inpcb *inp; int s; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); s = splnet(); in_pcbdetach(inp); INP_INFO_WUNLOCK(&udbinfo); splx(s); return 0; } static int udp_disconnect(struct socket *so) { struct inpcb *inp; int s; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); if 
(inp->inp_faddr.s_addr == INADDR_ANY) { INP_INFO_WUNLOCK(&udbinfo); INP_UNLOCK(inp); return ENOTCONN; } s = splnet(); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); splx(s); so->so_state &= ~SS_ISCONNECTED; /* XXX */ return 0; } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; int ret; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); m_freem(m); return EINVAL; } INP_LOCK(inp); ret = udp_output(inp, m, addr, control, td); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return ret; } int udp_shutdown(struct socket *so) { struct inpcb *inp; INP_INFO_RLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_RUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); INP_INFO_RUNLOCK(&udbinfo); socantsendmore(so); INP_UNLOCK(inp); return 0; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking * here because in_setsockaddr will call malloc and might block. */ static int udp_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &udbinfo)); } /* * This is the wrapper function for in_setpeeraddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int udp_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &udbinfo)); } struct pr_usrreqs udp_usrreqs = { udp_abort, pru_accept_notsupp, udp_attach, udp_bind, udp_connect, pru_connect2_notsupp, in_control, udp_detach, udp_disconnect, pru_listen_notsupp, udp_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, udp_send, pru_sense_null, udp_shutdown, udp_sockaddr, sosend, soreceive, sopoll, in_pcbsosetlabel };