Index: head/sys/kern/uipc_socket.c =================================================================== --- head/sys/kern/uipc_socket.c (revision 132017) +++ head/sys/kern/uipc_socket.c (revision 132018) @@ -1,2190 +1,2209 @@ /* * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004 Robert Watson * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_mac.h" #include "opt_zero.h" #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); #ifdef INET static int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt); #endif static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_solisten(struct knote *kn, long hint); static struct filterops solisten_filtops = { 1, NULL, filt_sordetach, filt_solisten }; static struct filterops soread_filtops = { 1, NULL, filt_sordetach, filt_soread }; static struct filterops sowrite_filtops = { 1, NULL, filt_sowdetach, filt_sowrite }; uma_zone_t socket_zone; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); SYSCTL_DECL(_kern_ipc); static int somaxconn = SOMAXCONN; SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, "Maximum pending socket connection queue size"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); #ifdef ZERO_COPY_SOCKETS /* These aren't static because they're used in other files. */ int so_zero_copy_send = 1; int so_zero_copy_receive = 1; SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0, "Zero copy controls"); SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW, &so_zero_copy_receive, 0, "Enable zero copy receive"); SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW, &so_zero_copy_send, 0, "Enable zero copy send"); #endif /* ZERO_COPY_SOCKETS */ /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. * * XXXRW: These variables might be better manipulated using atomic operations * for improved efficiency. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * Socket operation routines. * These routines are called by the routines in * sys_socket.c or from a system process, and * implement the semantics of socket operations by * switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. * Note that it would probably be better to allocate socket * and PCB at the same time, but I'm not convinced that all * the protocols can be easily modified to do this. * * soalloc() returns a socket with a ref count of 0. */ struct socket * soalloc(int mflags) { struct socket *so; #ifdef MAC int error; #endif so = uma_zalloc(socket_zone, mflags | M_ZERO); if (so != NULL) { #ifdef MAC error = mac_init_socket(so, mflags); if (error != 0) { uma_zfree(socket_zone, so); so = NULL; return so; } #endif SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); /* sx_init(&so->so_sxlock, "socket sxlock"); */ TAILQ_INIT(&so->so_aiojobq); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; mtx_unlock(&so_global_mtx); } return so; } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(dom, aso, type, proto, cred, td) int dom; struct socket **aso; int type; int proto; struct ucred *cred; struct thread *td; { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) return (EPROTONOSUPPORT); if (jailed(cred) && jail_socket_unixiproute_only && prp->pr_domain->dom_family != PF_LOCAL && prp->pr_domain->dom_family != PF_INET && prp->pr_domain->dom_family != PF_ROUTE) { return (EPROTONOSUPPORT); } if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(M_WAITOK); if (so == NULL) return (ENOBUFS); TAILQ_INIT(&so->so_incomp); TAILQ_INIT(&so->so_comp); so->so_type = type; so->so_cred = crhold(cred); so->so_proto = prp; #ifdef MAC mac_create_socket(cred, so); #endif SOCK_LOCK(so); soref(so); SOCK_UNLOCK(so); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); if (error) { SOCK_LOCK(so); so->so_state |= SS_NOFDREF; sorele(so); return (error); } *aso = so; return (0); } int sobind(so, nam, td) struct socket *so; struct sockaddr *nam; struct thread *td; { return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td)); } void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; mtx_unlock(&so_global_mtx); if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); #ifdef INET /* remove acccept filter if one is present. */ if (so->so_accf != NULL) do_setopt_accept_filter(so, NULL); #endif #ifdef MAC mac_destroy_socket(so); #endif crfree(so->so_cred); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); /* sx_destroy(&so->so_sxlock); */ uma_zfree(socket_zone, so); /* * XXXRW: Seems like a shame to grab the mutex again down here, but * we don't want to decrement the socket count until after we free * the socket, and we can't increment the gencnt on the socket after * we free, it so... */ mtx_lock(&so_global_mtx); --numopensockets; mtx_unlock(&so_global_mtx); } int solisten(so, backlog, td) struct socket *so; int backlog; struct thread *td; { int error; /* * XXXRW: Ordering issue here -- perhaps we need to set * SO_ACCEPTCONN before the call to pru_listen()? * XXXRW: General atomic test-and-set concerns here also. */ if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) return (EINVAL); error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td); if (error) return (error); ACCEPT_LOCK(); if (TAILQ_EMPTY(&so->so_comp)) { SOCK_LOCK(so); so->so_options |= SO_ACCEPTCONN; SOCK_UNLOCK(so); } if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->so_qlimit = backlog; ACCEPT_UNLOCK(); return (0); } void sofree(so) struct socket *so; { struct socket *head; KASSERT(so->so_count == 0, ("socket %p so_count not 0", so)); SOCK_LOCK_ASSERT(so); if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) { SOCK_UNLOCK(so); return; } SOCK_UNLOCK(so); ACCEPT_LOCK(); head = so->so_head; if (head != NULL) { KASSERT((so->so_qstate & SQ_COMP) != 0 || (so->so_qstate & SQ_INCOMP) != 0, ("sofree: so_head != NULL, but neither SQ_COMP nor " "SQ_INCOMP")); KASSERT((so->so_qstate & SQ_COMP) == 0 || (so->so_qstate & SQ_INCOMP) == 0, ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); /* * accept(2) is responsible draining the completed * connection queue and freeing those sockets, so * we just return here if this socket is currently * on the completed connection queue. Otherwise, * accept(2) may hang after select(2) has indicating * that a listening socket was ready. If it's an * incomplete connection, we remove it from the queue * and free it; otherwise, it won't be released until * the listening socket is closed. */ if ((so->so_qstate & SQ_COMP) != 0) { ACCEPT_UNLOCK(); return; } TAILQ_REMOVE(&head->so_incomp, so, so_list); head->so_incqlen--; so->so_qstate &= ~SQ_INCOMP; so->so_head = NULL; } KASSERT((so->so_qstate & SQ_COMP) == 0 && (so->so_qstate & SQ_INCOMP) == 0, ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); ACCEPT_UNLOCK(); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_NOINTR; (void)sblock(&so->so_snd, M_WAITOK); /* * socantsendmore_locked() drops the socket buffer mutex so that it * can safely perform wakeups. Re-acquire the mutex before * continuing. */ socantsendmore_locked(so); SOCKBUF_LOCK(&so->so_snd); sbunlock(&so->so_snd); sbrelease_locked(&so->so_snd, so); SOCKBUF_UNLOCK(&so->so_snd); sorflush(so); sodealloc(so); } /* * Close a socket on last file table reference removal. * Initiate disconnect if connected. * Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be * called prior to the ref count reaching zero. The actual socket * structure will not be freed until the ref count reaches zero. */ int soclose(so) struct socket *so; { int error = 0; funsetown(&so->so_sigio); if (so->so_options & SO_ACCEPTCONN) { struct socket *sp; ACCEPT_LOCK(); while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { TAILQ_REMOVE(&so->so_incomp, sp, so_list); so->so_incqlen--; sp->so_qstate &= ~SQ_INCOMP; sp->so_head = NULL; ACCEPT_UNLOCK(); (void) soabort(sp); ACCEPT_LOCK(); } while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { TAILQ_REMOVE(&so->so_comp, sp, so_list); so->so_qlen--; sp->so_qstate &= ~SQ_COMP; sp->so_head = NULL; ACCEPT_UNLOCK(); (void) soabort(sp); ACCEPT_LOCK(); } ACCEPT_UNLOCK(); } if (so->so_pcb == NULL) goto discard; if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) goto drop; } if (so->so_options & SO_LINGER) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_pcb != NULL) { int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); if (error == 0) error = error2; } discard: SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); return (error); } /* * soabort() must not be called with any socket locks held, as it calls * into the protocol, which will call back into the socket code causing * it to acquire additional socket locks that may cause recursion or lock * order reversals. */ int soabort(so) struct socket *so; { int error; error = (*so->so_proto->pr_usrreqs->pru_abort)(so); if (error) { SOCK_LOCK(so); sotryfree(so); /* note: does not decrement the ref count */ return error; } return (0); } int soaccept(so, nam) struct socket *so; struct sockaddr **nam; { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); return (error); } int soconnect(so, nam, td) struct socket *so; struct sockaddr *nam; struct thread *td; { int error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. * This allows user to disconnect by connecting to, e.g., * a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) error = EISCONN; else error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); return (error); } int soconnect2(so1, so2) struct socket *so1; struct socket *so2; { return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2)); } int sodisconnect(so) struct socket *so; { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) /* * Send on a socket. * If send must go all at once and message is larger than * send buffering, then hard error. * Lock against other senders. * If must go all at once and not enough room now, then * inform user that this would block and do nothing. * Otherwise, if nonblocking, send as much as possible. * The data to be sent is described by "uio" if nonzero, * otherwise by the mbuf chain "top" (which must be null * if uio is not). Data provided in mbuf chain must be small * enough to send all at once. * * Returns nonzero on error, timeout or signal; callers * must check for short counts if EINTR/ERESTART are returned. * Data and control buffers are freed on return. */ #ifdef ZERO_COPY_SOCKETS struct so_zerocopy_stats{ int size_ok; int align_ok; int found_ifp; }; struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; #include #include #include #include #include #include #endif /*ZERO_COPY_SOCKETS*/ int sosend(so, addr, uio, top, control, flags, td) struct socket *so; struct sockaddr *addr; struct uio *uio; struct mbuf *top; struct mbuf *control; int flags; struct thread *td; { struct mbuf **mp; struct mbuf *m; long space, len = 0, resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; #ifdef ZERO_COPY_SOCKETS int cow_send; #endif /* ZERO_COPY_SOCKETS */ if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. * However, space must be signed, as it might be less than 0 * if we over-committed, and we must use a signed comparison * of space and resid. On the other hand, a negative resid * causes us to loop sending 0-length segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_proc->p_stats->p_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; #define snderr(errno) { error = (errno); goto release; } SOCKBUF_LOCK(&so->so_snd); restart: SOCKBUF_LOCK_ASSERT(&so->so_snd); error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out_locked; do { SOCKBUF_LOCK_ASSERT(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) snderr(EPIPE); if (so->so_error) { error = so->so_error; so->so_error = 0; goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) snderr(ENOTCONN); } else if (addr == NULL) snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? ENOTCONN : EDESTADDRREQ); } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) snderr(EMSGSIZE); if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) snderr(EWOULDBLOCK); sbunlock(&so->so_snd); error = sbwait(&so->so_snd); if (error) goto out_locked; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); mp = ⊤ space -= clen; do { if (uio == NULL) { /* * Data is prepackaged in "top". */ resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else do { #ifdef ZERO_COPY_SOCKETS cow_send = 0; #endif /* ZERO_COPY_SOCKETS */ if (resid >= MINCLSIZE) { #ifdef ZERO_COPY_SOCKETS if (top == NULL) { MGETHDR(m, M_TRYWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; SOCKBUF_LOCK(&so->so_snd); goto release; } m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = (struct ifnet *)0; } else { MGET(m, M_TRYWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; SOCKBUF_LOCK(&so->so_snd); goto release; } } if (so_zero_copy_send && resid>=PAGE_SIZE && space>=PAGE_SIZE && uio->uio_iov->iov_len>=PAGE_SIZE) { so_zerocp_stats.size_ok++; if (!((vm_offset_t) uio->uio_iov->iov_base & PAGE_MASK)){ so_zerocp_stats.align_ok++; cow_send = socow_setup(m, uio); } } if (!cow_send) { MCLGET(m, M_TRYWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } else { len = min(min(MCLBYTES, resid), space); } } else len = PAGE_SIZE; #else /* ZERO_COPY_SOCKETS */ if (top == NULL) { m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR); m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = (struct ifnet *)0; } else m = m_getcl(M_TRYWAIT, MT_DATA, 0); len = min(min(MCLBYTES, resid), space); #endif /* ZERO_COPY_SOCKETS */ } else { if (top == NULL) { m = m_gethdr(M_TRYWAIT, MT_DATA); m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = (struct ifnet *)0; len = min(min(MHLEN, resid), space); /* * For datagram protocols, leave room * for protocol headers in first mbuf. */ if (atomic && m && len < MHLEN) MH_ALIGN(m, len); } else { m = m_get(M_TRYWAIT, MT_DATA); len = min(min(MLEN, resid), space); } } if (m == NULL) { error = ENOBUFS; SOCKBUF_LOCK(&so->so_snd); goto release; } space -= len; #ifdef ZERO_COPY_SOCKETS if (cow_send) error = 0; else #endif /* ZERO_COPY_SOCKETS */ error = uiomove(mtod(m, void *), (int)len, uio); resid = uio->uio_resid; m->m_len = len; *mp = m; top->m_pkthdr.len += len; if (error) { SOCKBUF_LOCK(&so->so_snd); goto release; } mp = &m->m_next; if (resid <= 0) { if (flags & MSG_EOR) top->m_flags |= M_EOR; break; } } while (space > 0 && atomic); if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have recieved * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We could * probably recheck again inside the splnet() protection * here, but there are probably other places that this * also happens. We must rethink this. */ error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol * understands this flag and nothing left to * send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; mp = ⊤ if (error) { SOCKBUF_LOCK(&so->so_snd); goto release; } } while (resid && space > 0); SOCKBUF_LOCK(&so->so_snd); } while (resid); release: SOCKBUF_LOCK_ASSERT(&so->so_snd); sbunlock(&so->so_snd); out_locked: SOCKBUF_LOCK_ASSERT(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreiceve(), * is unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(so, uio, flags) struct socket *so; struct uio *uio; int flags; { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); m = m_get(M_TRYWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { #ifdef ZERO_COPY_SOCKETS if (so_zero_copy_receive) { vm_page_t pg; int disposable; if ((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_DISPOSABLE)) disposable = 1; else disposable = 0; pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t))); if (uio->uio_offset == -1) uio->uio_offset =IDX_TO_OFF(pg->pindex); error = uiomoveco(mtod(m, void *), min(uio->uio_resid, m->m_len), uio, pg->object, disposable); } else #endif /* ZERO_COPY_SOCKETS */ error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. * We depend on the way that records are added to the sockbuf * by sbappend*. In particular, each record (mbufs linked through m_next) * must begin with an address if the protocol so specifies, * followed by an optional mbuf or mbufs containing ancillary data, * and then zero or more mbufs of data. * In order to avoid blocking network interrupts for the entire time here, * we splx() while doing the actual copy to user space. * Although the sockbuf is locked, new data may still be appended, * and thus we must maintain consistency of the sockbuf during that time. * * The caller may receive the data as a single mbuf chain by supplying * an mbuf **mp0 for use in returning the chain. The uio is then used * only for the count in uio_resid. */ int soreceive(so, psa, uio, mp0, controlp, flagsp) struct socket *so; struct sockaddr **psa; struct uio *uio; struct mbuf **mp0; struct mbuf **controlp; int *flagsp; { struct mbuf *m, **mp; int flags, len, error, offset; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; int orig_resid = uio->uio_resid; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) (*pr->pr_usrreqs->pru_rcvd)(so, 0); SOCKBUF_LOCK(&so->so_rcv); restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) goto out; m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more * (subject to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_WAITALL is set, and it is possible to do the entire * receive operation at once if we block (resid <= hiwat). * 3. MSG_DONTWAIT is not set * If MSG_WAITALL is set but resid is larger than the receive buffer, * we have to do the receive in sections, and thus risk returning * a short count if a timeout or signal occurs after we start. */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio->uio_resid) && (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !so->so_rcv.sb_cc, ("receive: m == %p so->so_rcv.sb_cc == %u", m, so->so_rcv.sb_cc)); if (so->so_error) { if (m != NULL) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m) goto dontblock; else goto release; } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { error = ENOTCONN; goto release; } if (uio->uio_resid == 0) goto release; if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); sbunlock(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) goto out; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization. */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL; struct mbuf **cme = &cm; do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copy(m, 0, m->m_len); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; if (controlp) { /* * Collect mbufs for processing below. */ *cme = m; cme = &(*cme)->m_next; } else m_free(m); m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); if (cm != NULL) { if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); error = (*pr->pr_domain->dom_externalize) (cm, controlp); SOCKBUF_LOCK(&so->so_rcv); } else m_freem(cm); } nextrecord = so->so_rcv.sb_mb->m_nextpkt; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA) { if (type != MT_OOBDATA) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. * Otherwise copy them out via the uio, then free. * Sockbuf must be consistent here (points to current mbuf, * it points to next record) when we drop priority; * we must note any additions to the sockbuf when we * block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); #ifdef ZERO_COPY_SOCKETS if (so_zero_copy_receive) { vm_page_t pg; int disposable; if ((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_DISPOSABLE)) disposable = 1; else disposable = 0; pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) + moff)); if (uio->uio_offset == -1) uio->uio_offset =IDX_TO_OFF(pg->pindex); error = uiomoveco(mtod(m, char *) + moff, (int)len, uio,pg->object, disposable); } else #endif /* ZERO_COPY_SOCKETS */ error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) goto release; } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } if (m != NULL) { m->m_nextpkt = nextrecord; if (nextrecord == NULL) so->so_rcv.sb_lastrecord = m; } else { so->so_rcv.sb_mb = nextrecord; SB_EMPTY_FIXUP(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_TRYWAIT); SOCKBUF_LOCK(&so->so_rcv); } m->m_data += len; m->m_len -= len; so->so_rcv.sb_cc -= len; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), * we must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return * with a short count but without error. * Keep sockbuf locked against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) goto release; m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) { SOCKBUF_UNLOCK(&so->so_rcv); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { sbunlock(&so->so_rcv); goto restart; } if (flagsp != NULL) *flagsp |= flags; release: SOCKBUF_LOCK_ASSERT(&so->so_rcv); sbunlock(&so->so_rcv); out: SOCKBUF_LOCK_ASSERT(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); return (error); } int soshutdown(so, how) struct socket *so; int how; { struct protosw *pr = so->so_proto; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) return ((*pr->pr_usrreqs->pru_shutdown)(so)); return (0); } void sorflush(so) struct socket *so; { struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; struct sockbuf asb; /* * XXXRW: This is quite ugly. The existing code made a copy of the * socket buffer, then zero'd the original to clear the buffer * fields. However, with mutexes in the socket buffer, this causes * problems. We only clear the zeroable bits of the original; * however, we have to initialize and destroy the mutex in the copy * so that dom_dispose() and sbrelease() can lock t as needed. */ SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOINTR; (void) sblock(sb, M_WAITOK); /* * socantrcvmore_locked() drops the socket buffer mutex so that it * can safely perform wakeups. Re-acquire the mutex before * continuing. */ socantrcvmore_locked(so); SOCKBUF_LOCK(sb); sbunlock(sb); /* * Invalidate/clear most of the sockbuf structure, but leave * selinfo and mutex data unchanged. */ bzero(&asb, offsetof(struct sockbuf, sb_startzero)); bcopy(&sb->sb_startzero, &asb.sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); SOCKBUF_LOCK_INIT(&asb, "so_rcv"); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(asb.sb_mb); sbrelease(&asb, so); SOCKBUF_LOCK_DESTROY(&asb); } #ifdef INET static int do_setopt_accept_filter(so, sopt) struct socket *so; struct sockopt *sopt; { struct accept_filter_arg *afap = NULL; struct accept_filter *afp; struct so_accf *af = so->so_accf; int error = 0; /* do not set/remove accept filters on non listen sockets */ if ((so->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto out; } /* removing the filter */ if (sopt == NULL) { if (af != NULL) { if (af->so_accept_filter != NULL && af->so_accept_filter->accf_destroy != NULL) { af->so_accept_filter->accf_destroy(so); } if (af->so_accept_filter_str != NULL) { FREE(af->so_accept_filter_str, M_ACCF); } FREE(af, M_ACCF); so->so_accf = NULL; } so->so_options &= ~SO_ACCEPTFILTER; return (0); } /* adding a filter */ /* must remove previous filter first */ if (af != NULL) { error = EINVAL; goto out; } /* don't put large objects on the kernel stack */ MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; if (error) goto out; afp = accept_filt_get(afap->af_name); if (afp == NULL) { error = ENOENT; goto out; } MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO); if (afp->accf_create != NULL) { if (afap->af_name[0] != '\0') { int len = strlen(afap->af_name) + 1; MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK); strcpy(af->so_accept_filter_str, afap->af_name); } af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg); if (af->so_accept_filter_arg == NULL) { FREE(af->so_accept_filter_str, M_ACCF); FREE(af, M_ACCF); so->so_accf = NULL; error = EINVAL; goto out; } } af->so_accept_filter = afp; so->so_accf = af; so->so_options |= SO_ACCEPTFILTER; out: if (afap != NULL) FREE(afap, M_TEMP); return (error); } #endif /* INET */ /* * Perhaps this routine, and sooptcopyout(), below, ought to come in * an additional variant to handle the case where the option value needs * to be some kind of integer, but not a specific size. * In addition to their use here, these functions are also called by the * protocol-level pr_ctloutput() routines. */ int sooptcopyin(sopt, buf, len, minlen) struct sockopt *sopt; void *buf; size_t len; size_t minlen; { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, * but if we don't get the minimum length the caller * wants, we return EINVAL. On success, sopt->sopt_valsize * is set to however much we actually retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return 0; } +/* + * Kernel version of setsockopt(2)/ + * XXX: optlen is size_t, not socklen_t + */ +int +kern_setsockopt(struct socket *so, int level, int optname, void *optval, + size_t optlen) +{ + struct sockopt sopt; + + sopt.sopt_level = level; + sopt.sopt_name = optname; + sopt.sopt_dir = SOPT_SET; + sopt.sopt_val = optval; + sopt.sopt_valsize = optlen; + sopt.sopt_td = NULL; + return (sosetopt(so, &sopt)); +} + int sosetopt(so, sopt) struct socket *so; struct sockopt *sopt; { int error, optval; struct linger l; struct timeval tv; u_long val; #ifdef MAC struct mac extmac; #endif error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) return ((*so->so_proto->pr_ctloutput) (so, sopt)); error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { #ifdef INET case SO_ACCEPTFILTER: error = do_setopt_accept_filter(so, sopt); if (error) goto bad; break; #endif case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for any of these * options, so disallow them. */ if (optval < 1) { error = EINVAL; goto bad; } switch (sopt->sopt_name) { case SO_SNDBUF: case SO_RCVBUF: if (sbreserve(sopt->sopt_name == SO_SNDBUF ? &so->so_snd : &so->so_rcv, (u_long)optval, so, curthread) == 0) { error = ENOBUFS; goto bad; } break; /* * Make sure the low-water is never greater than * the high-water. */ case SO_SNDLOWAT: SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_lowat = (optval > so->so_snd.sb_hiwat) ? so->so_snd.sb_hiwat : optval; SOCKBUF_UNLOCK(&so->so_snd); break; case SO_RCVLOWAT: SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_lowat = (optval > so->so_rcv.sb_hiwat) ? so->so_rcv.sb_hiwat : optval; SOCKBUF_UNLOCK(&so->so_rcv); break; } break; case SO_SNDTIMEO: case SO_RCVTIMEO: error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; /* assert(hz > 0); */ if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } /* assert(tick > 0); */ /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */ val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; if (val > SHRT_MAX) { error = EDOM; goto bad; } if (val == 0 && tv.tv_usec != 0) val = 1; switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = val; break; } break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; default: error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { (void) ((*so->so_proto->pr_ctloutput) (so, sopt)); } } bad: return (error); } /* Helper routine for getsockopt */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, * possibly truncated to fit in the user's buffer. * Traditional behavior is that we always tell the user * precisely how much we copied, rather than something useful * like the total amount we had available for her. * Note that this interface is not idempotent; the entire answer must * generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return error; } int sogetopt(so, sopt) struct socket *so; struct sockopt *sopt; { int error, optval; struct linger l; struct timeval tv; #ifdef INET struct accept_filter_arg *afap; #endif #ifdef MAC struct mac extmac; #endif error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { return ((*so->so_proto->pr_ctloutput) (so, sopt)); } else return (ENOPROTOOPT); } else { switch (sopt->sopt_name) { #ifdef INET case SO_ACCEPTFILTER: if ((so->so_options & SO_ACCEPTCONN) == 0) return (EINVAL); MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); if ((so->so_options & SO_ACCEPTFILTER) != 0) { strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); if (so->so_accf->so_accept_filter_str != NULL) strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); } error = sooptcopyout(sopt, afap, sizeof(*afap)); FREE(afap, M_TEMP); break; #endif case SO_LINGER: /* * XXXRW: We grab the lock here to get a consistent * snapshot of both fields. This may not really * be necessary. */ SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_BROADCAST: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_TYPE: optval = so->so_type; goto integer; case SO_ERROR: optval = so->so_error; so->so_error = 0; goto integer; case SO_SNDBUF: optval = so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: optval = (sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); tv.tv_sec = optval / hz; tv.tv_usec = (optval % hz) * tick; error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) return (error); error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) return (error); error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) return (error); error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) return (error); error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; default: error = ENOPROTOOPT; break; } return (error); } } /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT : M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return 0; } /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return 0; while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return 0; } /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */ int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return 0; while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return 0; } void sohasoutofband(so) struct socket *so; { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rcv.sb_sel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents = 0; if (events & (POLLIN | POLLRDNORM)) if (soreadable(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & POLLINIGNEOF) if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || !TAILQ_EMPTY(&so->so_comp) || so->so_error) revents |= POLLINIGNEOF; if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if (revents == 0) { if (events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND)) { SOCKBUF_LOCK(&so->so_rcv); selrecord(td, &so->so_rcv.sb_sel); so->so_rcv.sb_flags |= SB_SEL; SOCKBUF_UNLOCK(&so->so_rcv); } if (events & (POLLOUT | POLLWRNORM)) { SOCKBUF_LOCK(&so->so_snd); selrecord(td, &so->so_snd.sb_sel); so->so_snd.sb_flags |= SB_SEL; SOCKBUF_UNLOCK(&so->so_snd); } } return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; switch (kn->kn_filter) { case EVFILT_READ: if (so->so_options & SO_ACCEPTCONN) kn->kn_fop = &solisten_filtops; else kn->kn_fop = &soread_filtops; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; sb = &so->so_snd; break; default: return (1); } SOCKBUF_LOCK(sb); SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); return (0); } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; SOCKBUF_LOCK(&so->so_rcv); SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; SOCKBUF_UNLOCK(&so->so_rcv); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so = kn->kn_fp->f_data; int need_lock, result; /* * XXXRW: Conditional locking because filt_soread() can be called * either from KNOTE() in the socket context where the socket buffer * lock is already held, or from kqueue() itself. */ need_lock = !SOCKBUF_OWNED(&so->so_rcv); if (need_lock) SOCKBUF_LOCK(&so->so_rcv); kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; result = 1; } else if (so->so_error) /* temporary udp error */ result = 1; else if (kn->kn_sfflags & NOTE_LOWAT) result = (kn->kn_data >= kn->kn_sdata); else result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat); if (need_lock) SOCKBUF_UNLOCK(&so->so_rcv); return (result); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; SOCKBUF_LOCK(&so->so_snd); SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; SOCKBUF_UNLOCK(&so->so_snd); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so = kn->kn_fp->f_data; int need_lock, result; /* * XXXRW: Conditional locking because filt_soread() can be called * either from KNOTE() in the socket context where the socket buffer * lock is already held, or from kqueue() itself. */ need_lock = !SOCKBUF_OWNED(&so->so_snd); if (need_lock) SOCKBUF_LOCK(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; result = 1; } else if (so->so_error) /* temporary udp error */ result = 1; else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) result = 0; else if (kn->kn_sfflags & NOTE_LOWAT) result = (kn->kn_data >= kn->kn_sdata); else result = (kn->kn_data >= so->so_snd.sb_lowat); if (need_lock) SOCKBUF_UNLOCK(&so->so_snd); return (result); } /*ARGSUSED*/ static int filt_solisten(struct knote *kn, long hint) { struct socket *so = kn->kn_fp->f_data; kn->kn_data = so->so_qlen; return (! TAILQ_EMPTY(&so->so_comp)); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid == uid) return (0); return (EPERM); } Index: head/sys/nfsclient/nfs_socket.c =================================================================== --- head/sys/nfsclient/nfs_socket.c (revision 132017) +++ head/sys/nfsclient/nfs_socket.c (revision 132018) @@ -1,1552 +1,1560 @@ /* * Copyright (c) 1989, 1991, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95 */ #include __FBSDID("$FreeBSD$"); /* * Socket operations for use by nfs */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TRUE 1 #define FALSE 0 /* * Estimate rto for an nfs rpc sent via. an unreliable datagram. * Use the mean and mean deviation of rtt for the appropriate type of rpc * for the frequent rpcs and a default for the others. * The justification for doing "other" this way is that these rpcs * happen so infrequently that timer est. would probably be stale. * Also, since many of these rpcs are * non-idempotent, a conservative timeout is desired. * getattr, lookup - A+2D * read, write - A+4D * other - nm_timeo */ #define NFS_RTO(n, t) \ ((t) == 0 ? (n)->nm_timeo : \ ((t) < 3 ? \ (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \ ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1))) #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1] #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1] /* * Defines which timer to use for the procnum. * 0 - default * 1 - getattr * 2 - lookup * 3 - read * 4 - write */ static int proct[NFS_NPROCS] = { 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, }; static int nfs_realign_test; static int nfs_realign_count; static int nfs_bufpackets = 4; +static int nfs_reconnects; SYSCTL_DECL(_vfs_nfs); SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, ""); SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, ""); SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, ""); +SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0, + "number of times the nfs client has had to reconnect"); /* * There is a congestion window for outstanding rpcs maintained per mount * point. The cwnd size is adjusted in roughly the way that: * Van Jacobson, Congestion avoidance and Control, In "Proceedings of * SIGCOMM '88". ACM, August 1988. * describes for TCP. The cwnd size is chopped in half on a retransmit timeout * and incremented by 1/cwnd when each rpc reply is received and a full cwnd * of rpcs is in progress. * (The sent count and cwnd are scaled for integer arith.) * Variants of "slow start" were tried and were found to be too much of a * performance hit (ave. rtt 3 times larger), * I suspect due to the large rtt that nfs rpcs have. */ #define NFS_CWNDSCALE 256 #define NFS_MAXCWND (NFS_CWNDSCALE * 32) #define NFS_NBACKOFF 8 static int nfs_backoff[NFS_NBACKOFF] = { 2, 4, 8, 16, 32, 64, 128, 256, }; struct callout nfs_callout; static int nfs_msg(struct thread *, const char *, const char *, int); static int nfs_rcvlock(struct nfsreq *); static void nfs_rcvunlock(struct nfsreq *); static void nfs_realign(struct mbuf **pm, int hsiz); static int nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp); static int nfs_reply(struct nfsreq *); static void nfs_softterm(struct nfsreq *rep); static int nfs_reconnect(struct nfsreq *rep); /* * Initialize sockets and congestion for a new NFS connection. * We do not free the sockaddr if error. */ int nfs_connect(struct nfsmount *nmp, struct nfsreq *rep) { struct socket *so; int error, rcvreserve, sndreserve; - int pktscale; + int opt, pktscale; struct sockaddr *saddr; struct thread *td = &thread0; /* only used for socreate and sobind */ NET_ASSERT_GIANT(); nmp->nm_so = NULL; saddr = nmp->nm_nam; error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype, nmp->nm_soproto, nmp->nm_mountp->mnt_cred, td); if (error) goto bad; so = nmp->nm_so; nmp->nm_soflags = so->so_proto->pr_flags; + opt = 1; + (void)kern_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + (void)kern_setsockopt(so, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)); + /* * Some servers require that the client port be a reserved port number. */ if (nmp->nm_flag & NFSMNT_RESVPORT) { struct sockopt sopt; int ip, ip2, len; struct sockaddr_in6 ssin; struct sockaddr *sa; bzero(&sopt, sizeof sopt); switch(saddr->sa_family) { case AF_INET: sopt.sopt_level = IPPROTO_IP; sopt.sopt_name = IP_PORTRANGE; ip = IP_PORTRANGE_LOW; ip2 = IP_PORTRANGE_DEFAULT; len = sizeof (struct sockaddr_in); break; #ifdef INET6 case AF_INET6: sopt.sopt_level = IPPROTO_IPV6; sopt.sopt_name = IPV6_PORTRANGE; ip = IPV6_PORTRANGE_LOW; ip2 = IPV6_PORTRANGE_DEFAULT; len = sizeof (struct sockaddr_in6); break; #endif default: goto noresvport; } sa = (struct sockaddr *)&ssin; bzero(sa, len); sa->sa_len = len; sa->sa_family = saddr->sa_family; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = (void *)&ip; sopt.sopt_valsize = sizeof(ip); error = sosetopt(so, &sopt); if (error) goto bad; error = sobind(so, sa, td); if (error) goto bad; ip = ip2; error = sosetopt(so, &sopt); if (error) goto bad; noresvport: ; } /* * Protocols that do not require connections may be optionally left * unconnected for servers that reply from a port other than NFS_PORT. */ if (nmp->nm_flag & NFSMNT_NOCONN) { if (nmp->nm_soflags & PR_CONNREQUIRED) { error = ENOTCONN; goto bad; } } else { error = soconnect(so, nmp->nm_nam, td); if (error) goto bad; /* * Wait for the connection to complete. Cribbed from the * connect system call but with the wait timing out so * that interruptible mounts don't hang here for a long time. */ SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { (void) msleep(&so->so_timeo, SOCK_MTX(so), PSOCK, "nfscon", 2 * hz); if ((so->so_state & SS_ISCONNECTING) && so->so_error == 0 && rep && (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0) { so->so_state &= ~SS_ISCONNECTING; SOCK_UNLOCK(so); goto bad; } } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCK_UNLOCK(so); goto bad; } SOCK_UNLOCK(so); } - so->so_rcv.sb_timeo = 5 * hz; + so->so_rcv.sb_timeo = 12 * hz; so->so_snd.sb_timeo = 5 * hz; /* * Get buffer reservation size from sysctl, but impose reasonable * limits. */ pktscale = nfs_bufpackets; if (pktscale < 2) pktscale = 2; if (pktscale > 64) pktscale = 64; if (nmp->nm_sotype == SOCK_DGRAM) { sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale; rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) + NFS_MAXPKTHDR) * pktscale; } else if (nmp->nm_sotype == SOCK_SEQPACKET) { sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale; rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) + NFS_MAXPKTHDR) * pktscale; } else { if (nmp->nm_sotype != SOCK_STREAM) panic("nfscon sotype"); if (so->so_proto->pr_flags & PR_CONNREQUIRED) { struct sockopt sopt; int val; bzero(&sopt, sizeof sopt); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_KEEPALIVE; sopt.sopt_val = &val; sopt.sopt_valsize = sizeof val; val = 1; sosetopt(so, &sopt); } if (so->so_proto->pr_protocol == IPPROTO_TCP) { struct sockopt sopt; int val; bzero(&sopt, sizeof sopt); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = &val; sopt.sopt_valsize = sizeof val; val = 1; sosetopt(so, &sopt); } sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_int32_t)) * pktscale; rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_int32_t)) * pktscale; } error = soreserve(so, sndreserve, rcvreserve); if (error) goto bad; SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_NOINTR; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_NOINTR; SOCKBUF_UNLOCK(&so->so_snd); /* Initialize other non-zero congestion variables */ nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = nmp->nm_srtt[3] = (NFS_TIMEO << 3); nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] = nmp->nm_sdrtt[3] = 0; nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ nmp->nm_sent = 0; nmp->nm_timeouts = 0; return (0); bad: nfs_disconnect(nmp); return (error); } /* * Reconnect routine: * Called when a connection is broken on a reliable protocol. * - clean up the old socket * - nfs_connect() again * - set R_MUSTRESEND for all outstanding requests on mount point * If this fails the mount point is DEAD! * nb: Must be called with the nfs_sndlock() set on the mount point. */ static int nfs_reconnect(struct nfsreq *rep) { struct nfsreq *rp; struct nfsmount *nmp = rep->r_nmp; int error; + nfs_reconnects++; nfs_disconnect(nmp); while ((error = nfs_connect(nmp, rep)) != 0) { if (error == ERESTART) error = EINTR; if (error == EIO || error == EINTR) return (error); (void) tsleep(&lbolt, PSOCK, "nfscon", 0); } /* * Loop through outstanding request list and fix up all requests * on old socket. */ TAILQ_FOREACH(rp, &nfs_reqq, r_chain) { if (rp->r_nmp == nmp) rp->r_flags |= R_MUSTRESEND; } return (0); } /* * NFS disconnect. Clean up and unlink. */ void nfs_disconnect(struct nfsmount *nmp) { struct socket *so; NET_ASSERT_GIANT(); if (nmp->nm_so) { so = nmp->nm_so; nmp->nm_so = NULL; soshutdown(so, SHUT_RDWR); soclose(so); } } void nfs_safedisconnect(struct nfsmount *nmp) { struct nfsreq dummyreq; bzero(&dummyreq, sizeof(dummyreq)); dummyreq.r_nmp = nmp; nfs_rcvlock(&dummyreq); nfs_disconnect(nmp); nfs_rcvunlock(&dummyreq); } /* * This is the nfs send routine. For connection based socket types, it * must be called with an nfs_sndlock() on the socket. * - return EINTR if the RPC is terminated, 0 otherwise * - set R_MUSTRESEND if the send fails for any reason * - do any cleanup required by recoverable socket errors (?) */ int nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top, struct nfsreq *rep) { struct sockaddr *sendnam; int error, error2, soflags, flags; NET_ASSERT_GIANT(); KASSERT(rep, ("nfs_send: called with rep == NULL")); error = nfs_sigintr(rep->r_nmp, rep, rep->r_td); if (error) { m_freem(top); return (error); } if ((so = rep->r_nmp->nm_so) == NULL) { rep->r_flags |= R_MUSTRESEND; m_freem(top); return (0); } rep->r_flags &= ~R_MUSTRESEND; soflags = rep->r_nmp->nm_soflags; if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED)) sendnam = NULL; else sendnam = nam; if (so->so_type == SOCK_SEQPACKET) flags = MSG_EOR; else flags = 0; error = so->so_proto->pr_usrreqs->pru_sosend(so, sendnam, 0, top, 0, flags, curthread /*XXX*/); if (error == ENOBUFS && so->so_type == SOCK_DGRAM) { error = 0; rep->r_flags |= R_MUSTRESEND; } if (error) { /* * Don't report EPIPE errors on nfs sockets. * These can be due to idle tcp mounts which will be closed by * netapp, solaris, etc. if left idle too long. */ if (error != EPIPE) { log(LOG_INFO, "nfs send error %d for server %s\n", error, rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); } /* * Deal with errors for the client side. */ error2 = NFS_SIGREP(rep); if (error2) error = error2; else rep->r_flags |= R_MUSTRESEND; /* * Handle any recoverable (soft) socket errors here. (?) */ if (error != EINTR && error != ERESTART && error != EIO && error != EWOULDBLOCK && error != EPIPE) error = 0; } return (error); } /* * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all * done by soreceive(), but for SOCK_STREAM we must deal with the Record * Mark and consolidate the data into a new mbuf list. * nb: Sometimes TCP passes the data up to soreceive() in long lists of * small mbufs. * For SOCK_STREAM we must be very careful to read an entire record once * we have read any of it, even if the system call has been interrupted. */ static int nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp) { struct socket *so; struct uio auio; struct iovec aio; struct mbuf *m; struct mbuf *control; u_int32_t len; struct sockaddr **getnam; int error, error2, sotype, rcvflg; struct thread *td = curthread; /* XXX */ NET_ASSERT_GIANT(); /* * Set up arguments for soreceive() */ *mp = NULL; *aname = NULL; sotype = rep->r_nmp->nm_sotype; /* * For reliable protocols, lock against other senders/receivers * in case a reconnect is necessary. * For SOCK_STREAM, first get the Record Mark to find out how much * more there is to get. * We must lock the socket against other receivers * until we have an entire rpc request/reply. */ if (sotype != SOCK_DGRAM) { error = nfs_sndlock(rep); if (error) return (error); tryagain: /* * Check for fatal errors and resending request. */ /* * Ugh: If a reconnect attempt just happened, nm_so * would have changed. NULL indicates a failed * attempt that has essentially shut down this * mount point. */ if (rep->r_mrep || (error = NFS_SIGREP(rep)) != 0) { nfs_sndunlock(rep); return (error == 0 ? EINTR : error); } so = rep->r_nmp->nm_so; if (!so) { error = nfs_reconnect(rep); if (error) { nfs_sndunlock(rep); return (error); } goto tryagain; } while (rep->r_flags & R_MUSTRESEND) { m = m_copym(rep->r_mreq, 0, M_COPYALL, M_TRYWAIT); nfsstats.rpcretries++; error = nfs_send(so, rep->r_nmp->nm_nam, m, rep); if (error) { if (error == EINTR || error == ERESTART || error == EIO || (error = nfs_reconnect(rep)) != 0) { nfs_sndunlock(rep); return (error); } goto tryagain; } } nfs_sndunlock(rep); if (sotype == SOCK_STREAM) { aio.iov_base = (caddr_t) &len; aio.iov_len = sizeof(u_int32_t); auio.uio_iov = &aio; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_offset = 0; auio.uio_resid = sizeof(u_int32_t); auio.uio_td = td; do { rcvflg = MSG_WAITALL; error = so->so_proto->pr_usrreqs->pru_soreceive (so, NULL, &auio, NULL, NULL, &rcvflg); if (error == EWOULDBLOCK) { error2 = NFS_SIGREP(rep); if (error2) return (error2); } } while (0); if (!error && auio.uio_resid > 0) { /* * Don't log a 0 byte receive; it means * that the socket has been closed, and * can happen during normal operation * (forcible unmount or Solaris server). */ if (auio.uio_resid != sizeof (u_int32_t)) log(LOG_INFO, "short receive (%d/%d) from nfs server %s\n", (int)(sizeof(u_int32_t) - auio.uio_resid), (int)sizeof(u_int32_t), rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); error = EPIPE; } if (error) goto errout; len = ntohl(len) & ~0x80000000; /* * This is SERIOUS! We are out of sync with the sender * and forcing a disconnect/reconnect is all I can do. */ if (len > NFS_MAXPACKET) { log(LOG_ERR, "%s (%d) from nfs server %s\n", "impossible packet length", len, rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); error = EFBIG; goto errout; } auio.uio_resid = len; do { rcvflg = MSG_WAITALL; error = so->so_proto->pr_usrreqs->pru_soreceive (so, NULL, &auio, mp, NULL, &rcvflg); } while (0); if (!error && auio.uio_resid > 0) { if (len != auio.uio_resid) log(LOG_INFO, "short receive (%d/%d) from nfs server %s\n", len - auio.uio_resid, len, rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); error = EPIPE; } } else { /* * NB: Since uio_resid is big, MSG_WAITALL is ignored * and soreceive() will return when it has either a * control msg or a data msg. * We have no use for control msg., but must grab them * and then throw them away so we know what is going * on. */ auio.uio_resid = len = 100000000; /* Anything Big */ auio.uio_td = td; do { rcvflg = 0; error = so->so_proto->pr_usrreqs->pru_soreceive (so, NULL, &auio, mp, &control, &rcvflg); if (control) m_freem(control); if (error == EWOULDBLOCK && rep) { error2 = NFS_SIGREP(rep); if (error2) return (error2); } } while (!error && *mp == NULL && control); if ((rcvflg & MSG_EOR) == 0) printf("Egad!!\n"); if (!error && *mp == NULL) error = EPIPE; len -= auio.uio_resid; } errout: if (error && error != EINTR && error != EIO && error != ERESTART) { m_freem(*mp); *mp = NULL; if (error != EPIPE && error != EWOULDBLOCK) log(LOG_INFO, "receive error %d from nfs server %s\n", error, rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); error = nfs_sndlock(rep); if (!error) { error = nfs_reconnect(rep); if (!error) goto tryagain; else nfs_sndunlock(rep); } } } else { /* * We may have failed while rebinding the datagram socket * so attempt a rebind here. */ if ((so = rep->r_nmp->nm_so) == NULL) { error = nfs_sndlock(rep); if (!error) { error = nfs_reconnect(rep); nfs_sndunlock(rep); } if (error) return (error); so = rep->r_nmp->nm_so; } if (so->so_state & SS_ISCONNECTED) getnam = NULL; else getnam = aname; auio.uio_resid = len = 1000000; auio.uio_td = td; do { rcvflg = 0; error = so->so_proto->pr_usrreqs->pru_soreceive (so, getnam, &auio, mp, NULL, &rcvflg); if (error) { error2 = NFS_SIGREP(rep); if (error2) { error = error2; goto dgramout; } } if (error) { error2 = nfs_sndlock(rep); if (!error2) { error2 = nfs_reconnect(rep); if (error2) error = error2; else so = rep->r_nmp->nm_so; nfs_sndunlock(rep); } else { error = error2; } } } while (error == EWOULDBLOCK); dgramout: len -= auio.uio_resid; } if (error) { m_freem(*mp); *mp = NULL; } /* * Search for any mbufs that are not a multiple of 4 bytes long * or with m_data not longword aligned. * These could cause pointer alignment problems, so copy them to * well aligned mbufs. */ nfs_realign(mp, 5 * NFSX_UNSIGNED); return (error); } /* * Implement receipt of reply on a socket. * We must search through the list of received datagrams matching them * with outstanding requests using the xid, until ours is found. */ /* ARGSUSED */ static int nfs_reply(struct nfsreq *myrep) { struct nfsreq *rep; struct nfsmount *nmp = myrep->r_nmp; int32_t t1; struct mbuf *mrep, *md; struct sockaddr *nam; u_int32_t rxid, *tl; caddr_t dpos; int error; /* * Loop around until we get our own reply */ for (;;) { /* * Lock against other receivers so that I don't get stuck in * sbwait() after someone else has received my reply for me. * Also necessary for connection based protocols to avoid * race conditions during a reconnect. * If nfs_rcvlock() returns EALREADY, that means that * the reply has already been recieved by another * process and we can return immediately. In this * case, the lock is not taken to avoid races with * other processes. */ error = nfs_rcvlock(myrep); if (error == EALREADY) return (0); if (error) return (error); /* * Get the next Rpc reply off the socket */ error = nfs_receive(myrep, &nam, &mrep); nfs_rcvunlock(myrep); if (error) { /* * Ignore routing errors on connectionless protocols?? */ if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { nmp->nm_so->so_error = 0; if (myrep->r_flags & R_GETONEREP) return (0); continue; } return (error); } if (nam) FREE(nam, M_SONAME); /* * Get the xid and check that it is an rpc reply */ md = mrep; dpos = mtod(md, caddr_t); tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED); rxid = *tl++; if (*tl != rpc_reply) { nfsstats.rpcinvalid++; m_freem(mrep); nfsmout: if (myrep->r_flags & R_GETONEREP) return (0); continue; } /* * Loop through the request list to match up the reply * Iff no match, just drop the datagram */ TAILQ_FOREACH(rep, &nfs_reqq, r_chain) { if (rep->r_mrep == NULL && rxid == rep->r_xid) { /* Found it.. */ rep->r_mrep = mrep; rep->r_md = md; rep->r_dpos = dpos; /* * Update congestion window. * Do the additive increase of * one rpc/rtt. */ if (nmp->nm_cwnd <= nmp->nm_sent) { nmp->nm_cwnd += (NFS_CWNDSCALE * NFS_CWNDSCALE + (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd; if (nmp->nm_cwnd > NFS_MAXCWND) nmp->nm_cwnd = NFS_MAXCWND; } if (rep->r_flags & R_SENT) { rep->r_flags &= ~R_SENT; nmp->nm_sent -= NFS_CWNDSCALE; } /* * Update rtt using a gain of 0.125 on the mean * and a gain of 0.25 on the deviation. */ if (rep->r_flags & R_TIMING) { /* * Since the timer resolution of * NFS_HZ is so course, it can often * result in r_rtt == 0. Since * r_rtt == N means that the actual * rtt is between N+dt and N+2-dt ticks, * add 1. */ t1 = rep->r_rtt + 1; t1 -= (NFS_SRTT(rep) >> 3); NFS_SRTT(rep) += t1; if (t1 < 0) t1 = -t1; t1 -= (NFS_SDRTT(rep) >> 2); NFS_SDRTT(rep) += t1; } nmp->nm_timeouts = 0; break; } } /* * If not matched to a request, drop it. * If it's mine, get out. */ if (rep == 0) { nfsstats.rpcunexpected++; m_freem(mrep); } else if (rep == myrep) { if (rep->r_mrep == NULL) panic("nfsreply nil"); return (0); } if (myrep->r_flags & R_GETONEREP) return (0); } } /* * nfs_request - goes something like this * - fill in request struct * - links it into list * - calls nfs_send() for first transmit * - calls nfs_receive() to get reply * - break down rpc header and return with nfs reply pointed to * by mrep or error * nb: always frees up mreq mbuf list */ /* XXX overloaded before */ #define NQ_TRYLATERDEL 15 /* Initial try later delay (sec) */ int nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum, struct thread *td, struct ucred *cred, struct mbuf **mrp, struct mbuf **mdp, caddr_t *dposp) { struct mbuf *mrep, *m2; struct nfsreq *rep; u_int32_t *tl; int i; struct nfsmount *nmp; struct mbuf *m, *md, *mheadend; time_t waituntil; caddr_t dpos; int s, error = 0, mrest_len, auth_len, auth_type; int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0; struct timeval now; u_int32_t xid; /* Reject requests while attempting a forced unmount. */ if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) { m_freem(mrest); return (ESTALE); } nmp = VFSTONFS(vp->v_mount); if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) return nfs4_request(vp, mrest, procnum, td, cred, mrp, mdp, dposp); MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); rep->r_nmp = nmp; rep->r_vp = vp; rep->r_td = td; rep->r_procnum = procnum; getmicrouptime(&now); rep->r_lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay)); mrest_len = m_length(mrest, NULL); /* * Get the RPC header with authorization. */ auth_type = RPCAUTH_UNIX; if (cred->cr_ngroups < 1) panic("nfsreq nogrps"); auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ? nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + 5 * NFSX_UNSIGNED; m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len, mrest, mrest_len, &mheadend, &xid); /* * For stream protocols, insert a Sun RPC Record Mark. */ if (nmp->nm_sotype == SOCK_STREAM) { M_PREPEND(m, NFSX_UNSIGNED, M_TRYWAIT); *mtod(m, u_int32_t *) = htonl(0x80000000 | (m->m_pkthdr.len - NFSX_UNSIGNED)); } rep->r_mreq = m; rep->r_xid = xid; tryagain: if (nmp->nm_flag & NFSMNT_SOFT) rep->r_retry = nmp->nm_retry; else rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ rep->r_rtt = rep->r_rexmit = 0; if (proct[procnum] > 0) rep->r_flags = R_TIMING; else rep->r_flags = 0; rep->r_mrep = NULL; /* * Do the client side RPC. */ nfsstats.rpcrequests++; /* * Chain request into list of outstanding requests. Be sure * to put it LAST so timer finds oldest requests first. */ s = splsoftclock(); if (TAILQ_EMPTY(&nfs_reqq)) callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL); TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain); /* * If backing off another request or avoiding congestion, don't * send this one now but let timer do it. If not timing a request, * do it now. */ if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || (nmp->nm_flag & NFSMNT_DUMBTIMR) || nmp->nm_sent < nmp->nm_cwnd)) { splx(s); error = nfs_sndlock(rep); if (!error) { m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT); error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep); nfs_sndunlock(rep); } if (!error && (rep->r_flags & R_MUSTRESEND) == 0) { nmp->nm_sent += NFS_CWNDSCALE; rep->r_flags |= R_SENT; } } else { splx(s); rep->r_rtt = -1; } /* * Wait for the reply from our send or the timer's. */ if (!error || error == EPIPE) error = nfs_reply(rep); /* * RPC done, unlink the request. */ s = splsoftclock(); TAILQ_REMOVE(&nfs_reqq, rep, r_chain); if (TAILQ_EMPTY(&nfs_reqq)) callout_stop(&nfs_callout); splx(s); /* * Decrement the outstanding request count. */ if (rep->r_flags & R_SENT) { rep->r_flags &= ~R_SENT; /* paranoia */ nmp->nm_sent -= NFS_CWNDSCALE; } /* * If there was a successful reply and a tprintf msg. * tprintf a response. */ if (!error) nfs_up(rep, nmp, rep->r_td, "is alive again", NFSSTA_TIMEO); mrep = rep->r_mrep; md = rep->r_md; dpos = rep->r_dpos; if (error) { m_freem(rep->r_mreq); free((caddr_t)rep, M_NFSREQ); return (error); } /* * break down the rpc header and check if ok */ tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED); if (*tl++ == rpc_msgdenied) { if (*tl == rpc_mismatch) error = EOPNOTSUPP; else error = EACCES; m_freem(mrep); m_freem(rep->r_mreq); free((caddr_t)rep, M_NFSREQ); return (error); } /* * Just throw away any verifyer (ie: kerberos etc). */ i = fxdr_unsigned(int, *tl++); /* verf type */ i = fxdr_unsigned(int32_t, *tl); /* len */ if (i > 0) nfsm_adv(nfsm_rndup(i)); tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); /* 0 == ok */ if (*tl == 0) { tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); if (*tl != 0) { error = fxdr_unsigned(int, *tl); if ((nmp->nm_flag & NFSMNT_NFSV3) && error == NFSERR_TRYLATER) { m_freem(mrep); error = 0; waituntil = time_second + trylater_delay; while (time_second < waituntil) (void) tsleep(&lbolt, PSOCK, "nqnfstry", 0); trylater_delay *= nfs_backoff[trylater_cnt]; if (trylater_cnt < NFS_NBACKOFF - 1) trylater_cnt++; goto tryagain; } /* * If the File Handle was stale, invalidate the * lookup cache, just in case. */ if (error == ESTALE) cache_purge(vp); if (nmp->nm_flag & NFSMNT_NFSV3) { *mrp = mrep; *mdp = md; *dposp = dpos; error |= NFSERR_RETERR; } else m_freem(mrep); m_freem(rep->r_mreq); free((caddr_t)rep, M_NFSREQ); return (error); } *mrp = mrep; *mdp = md; *dposp = dpos; m_freem(rep->r_mreq); FREE((caddr_t)rep, M_NFSREQ); return (0); } m_freem(mrep); error = EPROTONOSUPPORT; nfsmout: m_freem(rep->r_mreq); free((caddr_t)rep, M_NFSREQ); return (error); } /* * Nfs timer routine * Scan the nfsreq list and retranmit any requests that have timed out * To avoid retransmission attempts on STREAM sockets (in the future) make * sure to set the r_retry field to 0 (implies nm_retry == 0). */ void nfs_timer(void *arg) { struct nfsreq *rep; struct mbuf *m; struct socket *so; struct nfsmount *nmp; int timeo; int s, error; struct thread *td; struct timeval now; getmicrouptime(&now); td = &thread0; /* XXX for credentials, may break if sleep */ s = splnet(); TAILQ_FOREACH(rep, &nfs_reqq, r_chain) { nmp = rep->r_nmp; if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) continue; if (nfs_sigintr(nmp, rep, rep->r_td)) continue; if (nmp->nm_tprintf_initial_delay != 0 && (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) && rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) { rep->r_lastmsg = now.tv_sec; nfs_down(rep, nmp, rep->r_td, "not responding", 0, NFSSTA_TIMEO); #if 0 if (!(nmp->nm_state & NFSSTA_MOUNTED)) { /* we're not yet completely mounted and */ /* we can't complete an RPC, so we fail */ nfsstats.rpctimeouts++; nfs_softterm(rep); continue; } #endif } if (rep->r_rtt >= 0) { rep->r_rtt++; if (nmp->nm_flag & NFSMNT_DUMBTIMR) timeo = nmp->nm_timeo; else timeo = NFS_RTO(nmp, proct[rep->r_procnum]); if (nmp->nm_timeouts > 0) timeo *= nfs_backoff[nmp->nm_timeouts - 1]; if (rep->r_rtt <= timeo) continue; if (nmp->nm_timeouts < NFS_NBACKOFF) nmp->nm_timeouts++; } if (rep->r_rexmit >= rep->r_retry) { /* too many */ nfsstats.rpctimeouts++; nfs_softterm(rep); continue; } if (nmp->nm_sotype != SOCK_DGRAM) { if (++rep->r_rexmit > NFS_MAXREXMIT) rep->r_rexmit = NFS_MAXREXMIT; continue; } if ((so = nmp->nm_so) == NULL) continue; /* * If there is enough space and the window allows.. * Resend it * Set r_rtt to -1 in case we fail to send it now. */ rep->r_rtt = -1; if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len && ((nmp->nm_flag & NFSMNT_DUMBTIMR) || (rep->r_flags & R_SENT) || nmp->nm_sent < nmp->nm_cwnd) && (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){ if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) error = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); else error = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, nmp->nm_nam, NULL, td); if (error) { if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) so->so_error = 0; rep->r_flags |= R_RESENDERR; } else { /* * Iff first send, start timing * else turn timing off, backoff timer * and divide congestion window by 2. */ rep->r_flags &= ~R_RESENDERR; if (rep->r_flags & R_SENT) { rep->r_flags &= ~R_TIMING; if (++rep->r_rexmit > NFS_MAXREXMIT) rep->r_rexmit = NFS_MAXREXMIT; nmp->nm_cwnd >>= 1; if (nmp->nm_cwnd < NFS_CWNDSCALE) nmp->nm_cwnd = NFS_CWNDSCALE; nfsstats.rpcretries++; } else { rep->r_flags |= R_SENT; nmp->nm_sent += NFS_CWNDSCALE; } rep->r_rtt = 0; } } } splx(s); callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL); } /* * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and * wait for all requests to complete. This is used by forced unmounts * to terminate any outstanding RPCs. */ int nfs_nmcancelreqs(nmp) struct nfsmount *nmp; { struct nfsreq *req; int i, s; s = splnet(); TAILQ_FOREACH(req, &nfs_reqq, r_chain) { if (nmp != req->r_nmp || req->r_mrep != NULL || (req->r_flags & R_SOFTTERM)) continue; nfs_softterm(req); } splx(s); for (i = 0; i < 30; i++) { s = splnet(); TAILQ_FOREACH(req, &nfs_reqq, r_chain) { if (nmp == req->r_nmp) break; } splx(s); if (req == NULL) return (0); tsleep(&lbolt, PSOCK, "nfscancel", 0); } return (EBUSY); } /* * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT). * The nm_send count is decremented now to avoid deadlocks when the process in * soreceive() hasn't yet managed to send its own request. */ static void nfs_softterm(struct nfsreq *rep) { rep->r_flags |= R_SOFTTERM; if (rep->r_flags & R_SENT) { rep->r_nmp->nm_sent -= NFS_CWNDSCALE; rep->r_flags &= ~R_SENT; } } /* * Test for a termination condition pending on the process. * This is used for NFSMNT_INT mounts. */ int nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td) { struct proc *p; sigset_t tmpset; if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) return nfs4_sigintr(nmp, rep, td); if (rep && (rep->r_flags & R_SOFTTERM)) return (EIO); /* Terminate all requests while attempting a forced unmount. */ if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF) return (EIO); if (!(nmp->nm_flag & NFSMNT_INT)) return (0); if (td == NULL) return (0); p = td->td_proc; PROC_LOCK(p); tmpset = p->p_siglist; SIGSETNAND(tmpset, td->td_sigmask); mtx_lock(&p->p_sigacts->ps_mtx); SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore); mtx_unlock(&p->p_sigacts->ps_mtx); if (SIGNOTEMPTY(p->p_siglist) && NFSINT_SIGMASK(tmpset)) { PROC_UNLOCK(p); return (EINTR); } PROC_UNLOCK(p); return (0); } /* * Lock a socket against others. * Necessary for STREAM sockets to ensure you get an entire rpc request/reply * and also to avoid race conditions between the processes with nfs requests * in progress when a reconnect is necessary. */ int nfs_sndlock(struct nfsreq *rep) { int *statep = &rep->r_nmp->nm_state; struct thread *td; int error, slpflag = 0, slptimeo = 0; td = rep->r_td; if (rep->r_nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; while (*statep & NFSSTA_SNDLOCK) { error = nfs_sigintr(rep->r_nmp, rep, td); if (error) return (error); *statep |= NFSSTA_WANTSND; (void) tsleep(statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } *statep |= NFSSTA_SNDLOCK; return (0); } /* * Unlock the stream socket for others. */ void nfs_sndunlock(struct nfsreq *rep) { int *statep = &rep->r_nmp->nm_state; if ((*statep & NFSSTA_SNDLOCK) == 0) panic("nfs sndunlock"); *statep &= ~NFSSTA_SNDLOCK; if (*statep & NFSSTA_WANTSND) { *statep &= ~NFSSTA_WANTSND; wakeup(statep); } } static int nfs_rcvlock(struct nfsreq *rep) { int *statep = &rep->r_nmp->nm_state; int error, slpflag, slptimeo = 0; if (rep->r_nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; else slpflag = 0; while (*statep & NFSSTA_RCVLOCK) { error = nfs_sigintr(rep->r_nmp, rep, rep->r_td); if (error) return (error); *statep |= NFSSTA_WANTRCV; (void) tsleep(statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo); /* * If our reply was recieved while we were sleeping, * then just return without taking the lock to avoid a * situation where a single iod could 'capture' the * recieve lock. */ if (rep->r_mrep != NULL) return (EALREADY); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } /* Always fail if our request has been cancelled. */ if (rep != NULL && (error = NFS_SIGREP(rep)) != 0) return (error); *statep |= NFSSTA_RCVLOCK; return (0); } /* * Unlock the stream socket for others. */ static void nfs_rcvunlock(struct nfsreq *rep) { int *statep = &rep->r_nmp->nm_state; if ((*statep & NFSSTA_RCVLOCK) == 0) panic("nfs rcvunlock"); *statep &= ~NFSSTA_RCVLOCK; if (*statep & NFSSTA_WANTRCV) { *statep &= ~NFSSTA_WANTRCV; wakeup(statep); } } /* * nfs_realign: * * Check for badly aligned mbuf data and realign by copying the unaligned * portion of the data into a new mbuf chain and freeing the portions * of the old chain that were replaced. * * We cannot simply realign the data within the existing mbuf chain * because the underlying buffers may contain other rpc commands and * we cannot afford to overwrite them. * * We would prefer to avoid this situation entirely. The situation does * not occur with NFS/UDP and is supposed to only occassionally occur * with TCP. Use vfs.nfs.realign_count and realign_test to check this. */ static void nfs_realign(struct mbuf **pm, int hsiz) { struct mbuf *m; struct mbuf *n = NULL; int off = 0; ++nfs_realign_test; while ((m = *pm) != NULL) { if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) { MGET(n, M_TRYWAIT, MT_DATA); if (m->m_len >= MINCLSIZE) { MCLGET(n, M_TRYWAIT); } n->m_len = 0; break; } pm = &m->m_next; } /* * If n is non-NULL, loop on m copying data, then replace the * portion of the chain that had to be realigned. */ if (n != NULL) { ++nfs_realign_count; while (m) { m_copyback(n, off, m->m_len, mtod(m, caddr_t)); off += m->m_len; m = m->m_next; } m_freem(*pm); *pm = n; } } static int nfs_msg(struct thread *td, const char *server, const char *msg, int error) { struct proc *p; p = td ? td->td_proc : NULL; if (error) { tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n", server, msg, error); } else { tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg); } return (0); } void nfs_down(rep, nmp, td, msg, error, flags) struct nfsreq *rep; struct nfsmount *nmp; struct thread *td; const char *msg; int error, flags; { if (nmp == NULL) return; if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) { vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, VQ_NOTRESP, 0); nmp->nm_state |= NFSSTA_TIMEO; } #ifdef NFSSTA_LOCKTIMEO if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) { vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, VQ_NOTRESPLOCK, 0); nmp->nm_state |= NFSSTA_LOCKTIMEO; } #endif if (rep) rep->r_flags |= R_TPRINTFMSG; nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error); } void nfs_up(rep, nmp, td, msg, flags) struct nfsreq *rep; struct nfsmount *nmp; struct thread *td; const char *msg; int flags; { if (nmp == NULL) return; if ((rep == NULL) || (rep->r_flags & R_TPRINTFMSG) != 0) nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0); if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) { nmp->nm_state &= ~NFSSTA_TIMEO; vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, VQ_NOTRESP, 1); } #ifdef NFSSTA_LOCKTIMEO if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) { nmp->nm_state &= ~NFSSTA_LOCKTIMEO; vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, VQ_NOTRESPLOCK, 1); } #endif } Index: head/sys/sys/socketvar.h =================================================================== --- head/sys/sys/socketvar.h (revision 132017) +++ head/sys/sys/socketvar.h (revision 132018) @@ -1,554 +1,556 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_SOCKETVAR_H_ #define _SYS_SOCKETVAR_H_ #include /* for TAILQ macros */ #include /* for struct selinfo */ #include #include /* * Kernel structure per socket. * Contains send and receive buffer queues, * handle on protocol and pointer to protocol * private data and error information. */ typedef u_quad_t so_gen_t; /*- * Locking key to struct socket: * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). * (c) locked by SOCKBUF_LOCK(&so->so_rcv). * (d) locked by SOCKBUF_LOCK(&so->so_snd). * (e) locked by ACCEPT_LOCK(). * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. */ struct socket { int so_count; /* (b) reference count */ short so_type; /* (a) generic type, see socket.h */ short so_options; /* from socket call, see socket.h */ short so_linger; /* time to linger while closing */ short so_state; /* (b) internal state flags SS_* */ int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ struct protosw *so_proto; /* (a) protocol handle */ /* * Variables for connection queuing. * Socket where accepts occur is so_head in all subsidiary sockets. * If so_head is 0, socket is not related to an accept. * For head socket so_incomp queues partially completed connections, * while so_comp is a queue of connections ready to be accepted. * If a connection is aborted and it has so_head set, then * it has to be pulled out of either so_incomp or so_comp. * We allow connections to queue up based on current queue lengths * and limit on number of queued connections for this socket. */ struct socket *so_head; /* (e) back pointer to accept socket */ TAILQ_HEAD(, socket) so_incomp; /* (e) queue of partial unaccepted connections */ TAILQ_HEAD(, socket) so_comp; /* (e) queue of complete unaccepted connections */ TAILQ_ENTRY(socket) so_list; /* (e) list of unaccepted connections */ short so_qlen; /* (e) number of unaccepted connections */ short so_incqlen; /* (e) number of unaccepted incomplete connections */ short so_qlimit; /* (e) max number queued connections */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ u_long so_oobmark; /* (c) chars to oob mark */ TAILQ_HEAD(, aiocblist) so_aiojobq; /* AIO ops waiting on socket */ /* * Variables for socket buffering. */ struct sockbuf { struct selinfo sb_sel; /* process selecting read/write */ struct mtx sb_mtx; /* sockbuf lock */ #define sb_startzero sb_mb struct mbuf *sb_mb; /* (c/d) the mbuf chain */ struct mbuf *sb_mbtail; /* (c/d) the last mbuf in the chain */ struct mbuf *sb_lastrecord; /* (c/d) first mbuf of last * record in socket buffer */ u_int sb_cc; /* (c/d) actual chars in buffer */ u_int sb_hiwat; /* (c/d) max actual char count */ u_int sb_mbcnt; /* (c/d) chars of mbufs used */ u_int sb_mbmax; /* (c/d) max chars of mbufs to use */ u_int sb_ctl; /* (c/d) non-data chars in buffer */ int sb_lowat; /* (c/d) low water mark */ int sb_timeo; /* (c/d) timeout for read/write */ short sb_flags; /* (c/d) flags, see below */ short sb_state; /* (c/d) socket state on sockbuf */ } so_rcv, so_snd; /* * Constants for sb_flags field of struct sockbuf. */ #define SB_MAX (256*1024) /* default for max chars in sockbuf */ /* * Constants for sb_flags field of struct sockbuf. */ #define SB_LOCK 0x01 /* lock on data queue */ #define SB_WANT 0x02 /* someone is waiting to lock */ #define SB_WAIT 0x04 /* someone is waiting for data/space */ #define SB_SEL 0x08 /* someone is selecting */ #define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ #define SB_UPCALL 0x20 /* someone wants an upcall */ #define SB_NOINTR 0x40 /* operations not interruptible */ #define SB_AIO 0x80 /* AIO operations queued */ #define SB_KNOTE 0x100 /* kernel note attached */ void (*so_upcall)(struct socket *, void *, int); void *so_upcallarg; struct ucred *so_cred; /* (a) user credentials */ struct label *so_label; /* (b) MAC label for socket */ struct label *so_peerlabel; /* (b) cached MAC label for peer */ /* NB: generation count must not be first; easiest to make it last. */ so_gen_t so_gencnt; /* (h) generation count */ void *so_emuldata; /* private data for emulators */ struct so_accf { struct accept_filter *so_accept_filter; void *so_accept_filter_arg; /* saved filter args */ char *so_accept_filter_str; /* saved user args */ } *so_accf; }; #define SB_EMPTY_FIXUP(sb) do { \ if ((sb)->sb_mb == NULL) { \ (sb)->sb_mbtail = NULL; \ (sb)->sb_lastrecord = NULL; \ } \ } while (/*CONSTCOND*/0) /* * Global accept mutex to serialize access to accept queues and * fields associated with multiple sockets. This allows us to * avoid defining a lock order between listen and accept sockets * until such time as it proves to be a good idea. */ extern struct mtx accept_mtx; #define ACCEPT_LOCK() mtx_lock(&accept_mtx) #define ACCEPT_UNLOCK() mtx_unlock(&accept_mtx) /* * Per-socket buffer mutex used to protect most fields in the socket * buffer. */ #define SOCKBUF_MTX(_sb) (&(_sb)->sb_mtx) #define SOCKBUF_LOCK_INIT(_sb, _name) \ mtx_init(SOCKBUF_MTX(_sb), _name, NULL, MTX_DEF) #define SOCKBUF_LOCK_DESTROY(_sb) mtx_destroy(SOCKBUF_MTX(_sb)) #define SOCKBUF_LOCK(_sb) mtx_lock(SOCKBUF_MTX(_sb)) #define SOCKBUF_OWNED(_sb) mtx_owned(SOCKBUF_MTX(_sb)) #define SOCKBUF_UNLOCK(_sb) mtx_unlock(SOCKBUF_MTX(_sb)) #define SOCKBUF_LOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_OWNED) /* * Per-socket mutex: we reuse the receive socket buffer mutex for space * efficiency. This decision should probably be revisited as we optimize * locking for the socket code. */ #define SOCK_MTX(_so) SOCKBUF_MTX(&(_so)->so_rcv) #define SOCK_LOCK(_so) SOCKBUF_LOCK(&(_so)->so_rcv) #define SOCK_OWNED(_so) SOCKBUF_OWNED(&(_so)->so_rcv) #define SOCK_UNLOCK(_so) SOCKBUF_UNLOCK(&(_so)->so_rcv) #define SOCK_LOCK_ASSERT(_so) SOCKBUF_LOCK_ASSERT(&(_so)->so_rcv) /* * Socket state bits. * * Historically, this bits were all kept in the so_state field. For * locking reasons, they are now in multiple fields, as they are * locked differently. so_state maintains basic socket state protected * by the socket lock. so_qstate holds information about the socket * accept queues. Each socket buffer also has a state field holding * information relevant to that socket buffer (can't send, rcv). Many * fields will be read without locks to improve performance and avoid * lock order issues. However, this approach must be used with caution. */ #define SS_NOFDREF 0x0001 /* no file table ref any more */ #define SS_ISCONNECTED 0x0002 /* socket connected to a peer */ #define SS_ISCONNECTING 0x0004 /* in process of connecting to peer */ #define SS_ISDISCONNECTING 0x0008 /* in process of disconnecting */ #define SS_NBIO 0x0100 /* non-blocking ops */ #define SS_ASYNC 0x0200 /* async i/o notify */ #define SS_ISCONFIRMING 0x0400 /* deciding to accept connection req */ #define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */ /* * Socket state bits now stored in the socket buffer state field. */ #define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ #define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ #define SBS_RCVATMARK 0x0040 /* at mark on input */ /* * Socket state bits stored in so_qstate. */ #define SQ_INCOMP 0x0800 /* unaccepted, incomplete connection */ #define SQ_COMP 0x1000 /* unaccepted, complete connection */ /* * Externalized form of struct socket used by the sysctl(3) interface. */ struct xsocket { size_t xso_len; /* length of this structure */ struct socket *xso_so; /* makes a convenient handle sometimes */ short so_type; short so_options; short so_linger; short so_state; caddr_t so_pcb; /* another convenient handle */ int xso_protocol; int xso_family; short so_qlen; short so_incqlen; short so_qlimit; short so_timeo; u_short so_error; pid_t so_pgid; u_long so_oobmark; struct xsockbuf { u_int sb_cc; u_int sb_hiwat; u_int sb_mbcnt; u_int sb_mbmax; int sb_lowat; int sb_timeo; short sb_flags; } so_rcv, so_snd; uid_t so_uid; /* XXX */ }; #ifdef _KERNEL /* * Macros for sockets and socket buffering. */ /* * Do we need to notify the other side when I/O is possible? */ #define sb_notify(sb) (((sb)->sb_flags & (SB_WAIT | SB_SEL | SB_ASYNC | \ SB_UPCALL | SB_AIO | SB_KNOTE)) != 0) /* * How much space is there in a socket buffer (so->so_snd or so->so_rcv)? * This is problematical if the fields are unsigned, as the space might * still be negative (cc > hiwat or mbcnt > mbmax). Should detect * overflow and return 0. Should use "lmin" but it doesn't exist now. */ #define sbspace(sb) \ ((long) imin((int)((sb)->sb_hiwat - (sb)->sb_cc), \ (int)((sb)->sb_mbmax - (sb)->sb_mbcnt))) /* do we have to send all at once on a socket? */ #define sosendallatonce(so) \ ((so)->so_proto->pr_flags & PR_ATOMIC) /* can we read something from so? */ #define soreadable(so) \ ((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \ ((so)->so_rcv.sb_state & SBS_CANTRCVMORE) || \ !TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error) /* can we write something to so? */ #define sowriteable(so) \ ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && \ (((so)->so_state&SS_ISCONNECTED) || \ ((so)->so_proto->pr_flags&PR_CONNREQUIRED)==0)) || \ ((so)->so_snd.sb_state & SBS_CANTSENDMORE) || \ (so)->so_error) /* adjust counters in sb reflecting allocation of m */ #define sballoc(sb, m) { \ (sb)->sb_cc += (m)->m_len; \ if ((m)->m_type != MT_DATA && (m)->m_type != MT_HEADER && \ (m)->m_type != MT_OOBDATA) \ (sb)->sb_ctl += (m)->m_len; \ (sb)->sb_mbcnt += MSIZE; \ if ((m)->m_flags & M_EXT) \ (sb)->sb_mbcnt += (m)->m_ext.ext_size; \ } /* adjust counters in sb reflecting freeing of m */ #define sbfree(sb, m) { \ (sb)->sb_cc -= (m)->m_len; \ if ((m)->m_type != MT_DATA && (m)->m_type != MT_HEADER && \ (m)->m_type != MT_OOBDATA) \ (sb)->sb_ctl -= (m)->m_len; \ (sb)->sb_mbcnt -= MSIZE; \ if ((m)->m_flags & M_EXT) \ (sb)->sb_mbcnt -= (m)->m_ext.ext_size; \ } /* * Set lock on sockbuf sb; sleep if lock is already held. * Unless SB_NOINTR is set on sockbuf, sleep is interruptible. * Returns error without lock if sleep is interrupted. */ #define sblock(sb, wf) ((sb)->sb_flags & SB_LOCK ? \ (((wf) == M_WAITOK) ? sb_lock(sb) : EWOULDBLOCK) : \ ((sb)->sb_flags |= SB_LOCK), 0) /* release lock on sockbuf sb */ #define sbunlock(sb) do { \ SOCKBUF_LOCK_ASSERT(sb); \ (sb)->sb_flags &= ~SB_LOCK; \ if ((sb)->sb_flags & SB_WANT) { \ (sb)->sb_flags &= ~SB_WANT; \ wakeup(&(sb)->sb_flags); \ } \ } while (0) /* * soref()/sorele() ref-count the socket structure. Note that you must * still explicitly close the socket, but the last ref count will free * the structure. */ #define soref(so) do { \ SOCK_LOCK_ASSERT(so); \ ++(so)->so_count; \ } while (0) #define sorele(so) do { \ SOCK_LOCK_ASSERT(so); \ if ((so)->so_count <= 0) \ panic("sorele"); \ if (--(so)->so_count == 0) \ sofree(so); \ else \ SOCK_UNLOCK(so); \ } while (0) #define sotryfree(so) do { \ SOCK_LOCK_ASSERT(so); \ if ((so)->so_count == 0) \ sofree(so); \ else \ SOCK_UNLOCK(so); \ } while(0) /* * In sorwakeup() and sowwakeup(), acquire the socket buffer lock to * avoid a non-atomic test-and-wakeup. However, sowakeup is * responsible for releasing the lock if it is called. We unlock only * if we don't call into sowakeup. If any code is introduced that * directly invokes the underlying sowakeup() primitives, it must * maintain the same semantics. */ #define sorwakeup_locked(so) do { \ SOCKBUF_LOCK_ASSERT(&(so)->so_rcv); \ if (sb_notify(&(so)->so_rcv)) \ sowakeup((so), &(so)->so_rcv); \ else \ SOCKBUF_UNLOCK(&(so)->so_rcv); \ } while (0) #define sorwakeup(so) do { \ SOCKBUF_LOCK(&(so)->so_rcv); \ sorwakeup_locked(so); \ } while (0) #define sowwakeup_locked(so) do { \ SOCKBUF_LOCK_ASSERT(&(so)->so_snd); \ if (sb_notify(&(so)->so_snd)) \ sowakeup((so), &(so)->so_snd); \ else \ SOCKBUF_UNLOCK(&(so)->so_snd); \ } while (0) #define sowwakeup(so) do { \ SOCKBUF_LOCK(&(so)->so_snd); \ sowwakeup_locked(so); \ } while (0) /* * Argument structure for sosetopt et seq. This is in the KERNEL * section because it will never be visible to user code. */ enum sopt_dir { SOPT_GET, SOPT_SET }; struct sockopt { enum sopt_dir sopt_dir; /* is this a get or a set? */ int sopt_level; /* second arg of [gs]etsockopt */ int sopt_name; /* third arg of [gs]etsockopt */ void *sopt_val; /* fourth arg of [gs]etsockopt */ size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ struct thread *sopt_td; /* calling thread or null if kernel */ }; struct accept_filter { char accf_name[16]; void (*accf_callback) (struct socket *so, void *arg, int waitflag); void * (*accf_create) (struct socket *so, char *arg); void (*accf_destroy) (struct socket *so); SLIST_ENTRY(accept_filter) accf_next; }; #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_ACCF); MALLOC_DECLARE(M_PCB); MALLOC_DECLARE(M_SONAME); #endif extern int maxsockets; extern u_long sb_max; extern struct uma_zone *socket_zone; extern so_gen_t so_gencnt; struct mbuf; struct sockaddr; struct ucred; struct uio; /* * From uipc_socket and friends */ +int kern_setsockopt(struct socket *so, int level, int optname, + void *optval, size_t optlen); int sockargs(struct mbuf **mp, caddr_t buf, int buflen, int type); int getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len); void sbappend(struct sockbuf *sb, struct mbuf *m); void sbappend_locked(struct sockbuf *sb, struct mbuf *m); void sbappendstream(struct sockbuf *sb, struct mbuf *m); void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m); int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control); int sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control); void sbappendrecord(struct sockbuf *sb, struct mbuf *m0); void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0); void sbcheck(struct sockbuf *sb); void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); struct mbuf * sbcreatecontrol(caddr_t p, int size, int type, int level); void sbdrop(struct sockbuf *sb, int len); void sbdrop_locked(struct sockbuf *sb, int len); void sbdroprecord(struct sockbuf *sb); void sbdroprecord_locked(struct sockbuf *sb); void sbflush(struct sockbuf *sb); void sbflush_locked(struct sockbuf *sb); void sbinsertoob(struct sockbuf *sb, struct mbuf *m0); void sbinsertoob_locked(struct sockbuf *sb, struct mbuf *m0); void sbrelease(struct sockbuf *sb, struct socket *so); void sbrelease_locked(struct sockbuf *sb, struct socket *so); int sbreserve(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td); int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td); void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb); int sbwait(struct sockbuf *sb); int sb_lock(struct sockbuf *sb); int soabort(struct socket *so); int soaccept(struct socket *so, struct sockaddr **nam); struct socket *soalloc(int mflags); int socheckuid(struct socket *so, uid_t uid); int sobind(struct socket *so, struct sockaddr *nam, struct thread *td); void socantrcvmore(struct socket *so); void socantrcvmore_locked(struct socket *so); void socantsendmore(struct socket *so); void socantsendmore_locked(struct socket *so); int soclose(struct socket *so); int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td); int soconnect2(struct socket *so1, struct socket *so2); int socow_setup(struct mbuf *m0, struct uio *uio); int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td); void sodealloc(struct socket *so); int sodisconnect(struct socket *so); struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags); void sofree(struct socket *so); int sogetopt(struct socket *so, struct sockopt *sopt); void sohasoutofband(struct socket *so); void soisconnected(struct socket *so); void soisconnecting(struct socket *so); void soisdisconnected(struct socket *so); void soisdisconnecting(struct socket *so); int solisten(struct socket *so, int backlog, struct thread *td); struct socket * sonewconn(struct socket *head, int connstatus); int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ int soopt_getm(struct sockopt *sopt, struct mbuf **mp); int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m); int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m); int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreserve(struct socket *so, u_long sndcc, u_long rcvcc); void sorflush(struct socket *so); int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sosetopt(struct socket *so, struct sockopt *sopt); int soshutdown(struct socket *so, int how); void sotoxsocket(struct socket *so, struct xsocket *xso); void sowakeup(struct socket *so, struct sockbuf *sb); #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *, const char *, int); #define SBLASTRECORDCHK(sb) sblastrecordchk((sb), __FILE__, __LINE__) void sblastmbufchk(struct sockbuf *, const char *, int); #define SBLASTMBUFCHK(sb) sblastmbufchk((sb), __FILE__, __LINE__) #else #define SBLASTRECORDCHK(sb) /* nothing */ #define SBLASTMBUFCHK(sb) /* nothing */ #endif /* SOCKBUF_DEBUG */ /* * Accept filter functions (duh). */ int accept_filt_add(struct accept_filter *filt); int accept_filt_del(char *name); struct accept_filter *accept_filt_get(char *name); #ifdef ACCEPT_FILTER_MOD #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_accf); #endif int accept_filt_generic_mod_event(module_t mod, int event, void *data); #endif #endif /* _KERNEL */ #endif /* !_SYS_SOCKETVAR_H_ */