Index: head/sys/cam/ctl/ctl_ha.c =================================================================== --- head/sys/cam/ctl/ctl_ha.c (revision 319721) +++ head/sys/cam/ctl/ctl_ha.c (revision 319722) @@ -1,1030 +1,1006 @@ /*- * Copyright (c) 2015 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if (__FreeBSD_version < 1100000) struct mbufq { struct mbuf *head; struct mbuf *tail; }; static void mbufq_init(struct mbufq *q, int limit) { q->head = q->tail = NULL; } static void mbufq_drain(struct mbufq *q) { struct mbuf *m; while ((m = q->head) != NULL) { q->head = m->m_nextpkt; m_freem(m); } q->tail = NULL; } static struct mbuf * mbufq_dequeue(struct mbufq *q) { struct mbuf *m; m = q->head; if (m) { if (q->tail == m) q->tail = NULL; q->head = m->m_nextpkt; m->m_nextpkt = NULL; } return (m); } static void mbufq_enqueue(struct mbufq *q, struct mbuf *m) { m->m_nextpkt = NULL; if (q->tail) q->tail->m_nextpkt = m; else q->head = m; q->tail = m; } static u_int sbavail(struct sockbuf *sb) { return (sb->sb_cc); } #if (__FreeBSD_version < 1000000) #define mtodo(m, o) ((void *)(((m)->m_data) + (o))) #endif #endif struct ha_msg_wire { uint32_t channel; uint32_t length; }; struct ha_dt_msg_wire { ctl_ha_dt_cmd command; uint32_t size; uint8_t *local; uint8_t *remote; }; struct ha_softc { struct ctl_softc *ha_ctl_softc; ctl_evt_handler ha_handler[CTL_HA_CHAN_MAX]; char ha_peer[128]; struct sockaddr_in ha_peer_in; struct socket *ha_lso; struct socket *ha_so; struct mbufq ha_sendq; struct mbuf *ha_sending; struct mtx ha_lock; int ha_connect; int ha_listen; int ha_connected; int ha_receiving; int ha_wakeup; int ha_disconnect; int ha_shutdown; eventhandler_tag ha_shutdown_eh; TAILQ_HEAD(, ctl_ha_dt_req) ha_dts; } ha_softc; static void ctl_ha_conn_wake(struct ha_softc *softc) { mtx_lock(&softc->ha_lock); softc->ha_wakeup = 1; mtx_unlock(&softc->ha_lock); wakeup(&softc->ha_wakeup); } static int ctl_ha_lupcall(struct socket *so, void *arg, int waitflag) { struct ha_softc *softc = arg; ctl_ha_conn_wake(softc); return (SU_OK); } static int ctl_ha_rupcall(struct socket *so, void *arg, int waitflag) { struct ha_softc *softc = arg; wakeup(&softc->ha_receiving); return (SU_OK); } static int ctl_ha_supcall(struct socket *so, void *arg, int waitflag) { struct ha_softc *softc = arg; ctl_ha_conn_wake(softc); return (SU_OK); } static void ctl_ha_evt(struct ha_softc *softc, ctl_ha_channel ch, ctl_ha_event evt, int param) { int i; if (ch < CTL_HA_CHAN_MAX) { if (softc->ha_handler[ch]) softc->ha_handler[ch](ch, evt, param); return; } for (i = 0; i < CTL_HA_CHAN_MAX; i++) { if (softc->ha_handler[i]) softc->ha_handler[i](i, evt, param); } } static void ctl_ha_close(struct ha_softc *softc) { struct socket *so = softc->ha_so; int report = 0; if (softc->ha_connected || softc->ha_disconnect) { softc->ha_connected = 0; mbufq_drain(&softc->ha_sendq); m_freem(softc->ha_sending); softc->ha_sending = NULL; report = 1; } if (so) { SOCKBUF_LOCK(&so->so_rcv); soupcall_clear(so, SO_RCV); while (softc->ha_receiving) { wakeup(&softc->ha_receiving); msleep(&softc->ha_receiving, SOCKBUF_MTX(&so->so_rcv), 0, "ha_rx exit", 0); } SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); soupcall_clear(so, SO_SND); SOCKBUF_UNLOCK(&so->so_snd); softc->ha_so = NULL; if (softc->ha_connect) pause("reconnect", hz / 2); soclose(so); } if (report) { ctl_ha_evt(softc, CTL_HA_CHAN_MAX, CTL_HA_EVT_LINK_CHANGE, (softc->ha_connect || softc->ha_listen) ? CTL_HA_LINK_UNKNOWN : CTL_HA_LINK_OFFLINE); } } static void ctl_ha_lclose(struct ha_softc *softc) { if (softc->ha_lso) { SOCKBUF_LOCK(&softc->ha_lso->so_rcv); soupcall_clear(softc->ha_lso, SO_RCV); SOCKBUF_UNLOCK(&softc->ha_lso->so_rcv); soclose(softc->ha_lso); softc->ha_lso = NULL; } } static void ctl_ha_rx_thread(void *arg) { struct ha_softc *softc = arg; struct socket *so = softc->ha_so; struct ha_msg_wire wire_hdr; struct uio uio; struct iovec iov; int error, flags, next; bzero(&wire_hdr, sizeof(wire_hdr)); while (1) { if (wire_hdr.length > 0) next = wire_hdr.length; else next = sizeof(wire_hdr); SOCKBUF_LOCK(&so->so_rcv); while (sbavail(&so->so_rcv) < next || softc->ha_disconnect) { if (softc->ha_connected == 0 || softc->ha_disconnect || so->so_error || (so->so_rcv.sb_state & SBS_CANTRCVMORE)) { goto errout; } so->so_rcv.sb_lowat = next; msleep(&softc->ha_receiving, SOCKBUF_MTX(&so->so_rcv), 0, "-", 0); } SOCKBUF_UNLOCK(&so->so_rcv); if (wire_hdr.length == 0) { iov.iov_base = &wire_hdr; iov.iov_len = sizeof(wire_hdr); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_rw = UIO_READ; uio.uio_segflg = UIO_SYSSPACE; uio.uio_td = curthread; uio.uio_resid = sizeof(wire_hdr); flags = MSG_DONTWAIT; error = soreceive(softc->ha_so, NULL, &uio, NULL, NULL, &flags); if (error != 0) { printf("%s: header receive error %d\n", __func__, error); SOCKBUF_LOCK(&so->so_rcv); goto errout; } } else { ctl_ha_evt(softc, wire_hdr.channel, CTL_HA_EVT_MSG_RECV, wire_hdr.length); wire_hdr.length = 0; } } errout: softc->ha_receiving = 0; wakeup(&softc->ha_receiving); SOCKBUF_UNLOCK(&so->so_rcv); ctl_ha_conn_wake(softc); kthread_exit(); } static void ctl_ha_send(struct ha_softc *softc) { struct socket *so = softc->ha_so; int error; while (1) { if (softc->ha_sending == NULL) { mtx_lock(&softc->ha_lock); softc->ha_sending = mbufq_dequeue(&softc->ha_sendq); mtx_unlock(&softc->ha_lock); if (softc->ha_sending == NULL) { so->so_snd.sb_lowat = so->so_snd.sb_hiwat + 1; break; } } SOCKBUF_LOCK(&so->so_snd); if (sbspace(&so->so_snd) < softc->ha_sending->m_pkthdr.len) { so->so_snd.sb_lowat = softc->ha_sending->m_pkthdr.len; SOCKBUF_UNLOCK(&so->so_snd); break; } SOCKBUF_UNLOCK(&so->so_snd); error = sosend(softc->ha_so, NULL, NULL, softc->ha_sending, NULL, MSG_DONTWAIT, curthread); softc->ha_sending = NULL; if (error != 0) { printf("%s: sosend() error %d\n", __func__, error); return; } } } static void ctl_ha_sock_setup(struct ha_softc *softc) { struct sockopt opt; struct socket *so = softc->ha_so; int error, val; val = 1024 * 1024; error = soreserve(so, val, val); if (error) printf("%s: soreserve failed %d\n", __func__, error); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_lowat = sizeof(struct ha_msg_wire); soupcall_set(so, SO_RCV, ctl_ha_rupcall, softc); SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_lowat = sizeof(struct ha_msg_wire); soupcall_set(so, SO_SND, ctl_ha_supcall, softc); SOCKBUF_UNLOCK(&so->so_snd); bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = SOL_SOCKET; opt.sopt_name = SO_KEEPALIVE; opt.sopt_val = &val; opt.sopt_valsize = sizeof(val); val = 1; error = sosetopt(so, &opt); if (error) printf("%s: KEEPALIVE setting failed %d\n", __func__, error); opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; val = 1; error = sosetopt(so, &opt); if (error) printf("%s: NODELAY setting failed %d\n", __func__, error); opt.sopt_name = TCP_KEEPINIT; val = 3; error = sosetopt(so, &opt); if (error) printf("%s: KEEPINIT setting failed %d\n", __func__, error); opt.sopt_name = TCP_KEEPIDLE; val = 1; error = sosetopt(so, &opt); if (error) printf("%s: KEEPIDLE setting failed %d\n", __func__, error); opt.sopt_name = TCP_KEEPINTVL; val = 1; error = sosetopt(so, &opt); if (error) printf("%s: KEEPINTVL setting failed %d\n", __func__, error); opt.sopt_name = TCP_KEEPCNT; val = 5; error = sosetopt(so, &opt); if (error) printf("%s: KEEPCNT setting failed %d\n", __func__, error); } static int ctl_ha_connect(struct ha_softc *softc) { struct thread *td = curthread; struct sockaddr_in sa; struct socket *so; int error; /* Create the socket */ error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP, td->td_ucred, td); if (error != 0) { printf("%s: socreate() error %d\n", __func__, error); return (error); } softc->ha_so = so; ctl_ha_sock_setup(softc); memcpy(&sa, &softc->ha_peer_in, sizeof(sa)); error = soconnect(so, (struct sockaddr *)&sa, td); if (error != 0) { if (bootverbose) printf("%s: soconnect() error %d\n", __func__, error); goto out; } return (0); out: ctl_ha_close(softc); return (error); } static int ctl_ha_accept(struct ha_softc *softc) { - struct socket *so; + struct socket *lso, *so; struct sockaddr *sap; int error; - ACCEPT_LOCK(); - if (softc->ha_lso->so_rcv.sb_state & SBS_CANTRCVMORE) - softc->ha_lso->so_error = ECONNABORTED; - if (softc->ha_lso->so_error) { - error = softc->ha_lso->so_error; - softc->ha_lso->so_error = 0; - ACCEPT_UNLOCK(); + lso = softc->ha_lso; + SOLISTEN_LOCK(lso); + error = solisten_dequeue(lso, &so, 0); + if (error == EWOULDBLOCK) + return (error); + if (error) { printf("%s: socket error %d\n", __func__, error); goto out; } - so = TAILQ_FIRST(&softc->ha_lso->so_comp); - if (so == NULL) { - ACCEPT_UNLOCK(); - return (EWOULDBLOCK); - } - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&softc->ha_lso->so_comp, so, so_list); - softc->ha_lso->so_qlen--; - so->so_state |= SS_NBIO; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); - sap = NULL; error = soaccept(so, &sap); if (error != 0) { printf("%s: soaccept() error %d\n", __func__, error); if (sap != NULL) free(sap, M_SONAME); goto out; } if (sap != NULL) free(sap, M_SONAME); softc->ha_so = so; ctl_ha_sock_setup(softc); return (0); out: ctl_ha_lclose(softc); return (error); } static int ctl_ha_listen(struct ha_softc *softc) { struct thread *td = curthread; struct sockaddr_in sa; struct sockopt opt; int error, val; /* Create the socket */ if (softc->ha_lso == NULL) { error = socreate(PF_INET, &softc->ha_lso, SOCK_STREAM, IPPROTO_TCP, td->td_ucred, td); if (error != 0) { printf("%s: socreate() error %d\n", __func__, error); return (error); } bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = SOL_SOCKET; opt.sopt_name = SO_REUSEADDR; opt.sopt_val = &val; opt.sopt_valsize = sizeof(val); val = 1; error = sosetopt(softc->ha_lso, &opt); if (error) { printf("%s: REUSEADDR setting failed %d\n", __func__, error); } bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = SOL_SOCKET; opt.sopt_name = SO_REUSEPORT; opt.sopt_val = &val; opt.sopt_valsize = sizeof(val); val = 1; error = sosetopt(softc->ha_lso, &opt); if (error) { printf("%s: REUSEPORT setting failed %d\n", __func__, error); } - SOCKBUF_LOCK(&softc->ha_lso->so_rcv); - soupcall_set(softc->ha_lso, SO_RCV, ctl_ha_lupcall, softc); - SOCKBUF_UNLOCK(&softc->ha_lso->so_rcv); } memcpy(&sa, &softc->ha_peer_in, sizeof(sa)); error = sobind(softc->ha_lso, (struct sockaddr *)&sa, td); if (error != 0) { printf("%s: sobind() error %d\n", __func__, error); goto out; } error = solisten(softc->ha_lso, 1, td); if (error != 0) { printf("%s: solisten() error %d\n", __func__, error); goto out; } + SOLISTEN_LOCK(softc->ha_lso); + softc->ha_lso->so_state |= SS_NBIO; + solisten_upcall_set(softc->ha_lso, ctl_ha_lupcall, softc); + SOLISTEN_UNLOCK(softc->ha_lso); return (0); out: ctl_ha_lclose(softc); return (error); } static void ctl_ha_conn_thread(void *arg) { struct ha_softc *softc = arg; int error; while (1) { if (softc->ha_disconnect || softc->ha_shutdown) { ctl_ha_close(softc); if (softc->ha_disconnect == 2 || softc->ha_shutdown) ctl_ha_lclose(softc); softc->ha_disconnect = 0; if (softc->ha_shutdown) break; } else if (softc->ha_so != NULL && (softc->ha_so->so_error || softc->ha_so->so_rcv.sb_state & SBS_CANTRCVMORE)) ctl_ha_close(softc); if (softc->ha_so == NULL) { if (softc->ha_lso != NULL) ctl_ha_accept(softc); else if (softc->ha_listen) ctl_ha_listen(softc); else if (softc->ha_connect) ctl_ha_connect(softc); } if (softc->ha_so != NULL) { if (softc->ha_connected == 0 && softc->ha_so->so_error == 0 && (softc->ha_so->so_state & SS_ISCONNECTING) == 0) { softc->ha_connected = 1; ctl_ha_evt(softc, CTL_HA_CHAN_MAX, CTL_HA_EVT_LINK_CHANGE, CTL_HA_LINK_ONLINE); softc->ha_receiving = 1; error = kproc_kthread_add(ctl_ha_rx_thread, softc, &softc->ha_ctl_softc->ctl_proc, NULL, 0, 0, "ctl", "ha_rx"); if (error != 0) { printf("Error creating CTL HA rx thread!\n"); softc->ha_receiving = 0; softc->ha_disconnect = 1; } } ctl_ha_send(softc); } mtx_lock(&softc->ha_lock); if (softc->ha_so != NULL && (softc->ha_so->so_error || softc->ha_so->so_rcv.sb_state & SBS_CANTRCVMORE)) ; else if (!softc->ha_wakeup) msleep(&softc->ha_wakeup, &softc->ha_lock, 0, "-", hz); softc->ha_wakeup = 0; mtx_unlock(&softc->ha_lock); } mtx_lock(&softc->ha_lock); softc->ha_shutdown = 2; wakeup(&softc->ha_wakeup); mtx_unlock(&softc->ha_lock); kthread_exit(); } static int ctl_ha_peer_sysctl(SYSCTL_HANDLER_ARGS) { struct ha_softc *softc = (struct ha_softc *)arg1; struct sockaddr_in *sa; int error, b1, b2, b3, b4, p, num; char buf[128]; strlcpy(buf, softc->ha_peer, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if ((error != 0) || (req->newptr == NULL) || strncmp(buf, softc->ha_peer, sizeof(buf)) == 0) return (error); sa = &softc->ha_peer_in; mtx_lock(&softc->ha_lock); if ((num = sscanf(buf, "connect %d.%d.%d.%d:%d", &b1, &b2, &b3, &b4, &p)) >= 4) { softc->ha_connect = 1; softc->ha_listen = 0; } else if ((num = sscanf(buf, "listen %d.%d.%d.%d:%d", &b1, &b2, &b3, &b4, &p)) >= 4) { softc->ha_connect = 0; softc->ha_listen = 1; } else { softc->ha_connect = 0; softc->ha_listen = 0; if (buf[0] != 0) { buf[0] = 0; error = EINVAL; } } strlcpy(softc->ha_peer, buf, sizeof(softc->ha_peer)); if (softc->ha_connect || softc->ha_listen) { memset(sa, 0, sizeof(*sa)); sa->sin_len = sizeof(struct sockaddr_in); sa->sin_family = AF_INET; sa->sin_port = htons((num >= 5) ? p : 999); sa->sin_addr.s_addr = htonl((b1 << 24) + (b2 << 16) + (b3 << 8) + b4); } softc->ha_disconnect = 2; softc->ha_wakeup = 1; mtx_unlock(&softc->ha_lock); wakeup(&softc->ha_wakeup); return (error); } ctl_ha_status ctl_ha_msg_register(ctl_ha_channel channel, ctl_evt_handler handler) { struct ha_softc *softc = &ha_softc; KASSERT(channel < CTL_HA_CHAN_MAX, ("Wrong CTL HA channel %d", channel)); softc->ha_handler[channel] = handler; return (CTL_HA_STATUS_SUCCESS); } ctl_ha_status ctl_ha_msg_deregister(ctl_ha_channel channel) { struct ha_softc *softc = &ha_softc; KASSERT(channel < CTL_HA_CHAN_MAX, ("Wrong CTL HA channel %d", channel)); softc->ha_handler[channel] = NULL; return (CTL_HA_STATUS_SUCCESS); } /* * Receive a message of the specified size. */ ctl_ha_status ctl_ha_msg_recv(ctl_ha_channel channel, void *addr, size_t len, int wait) { struct ha_softc *softc = &ha_softc; struct uio uio; struct iovec iov; int error, flags; if (!softc->ha_connected) return (CTL_HA_STATUS_DISCONNECT); iov.iov_base = addr; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_rw = UIO_READ; uio.uio_segflg = UIO_SYSSPACE; uio.uio_td = curthread; uio.uio_resid = len; flags = wait ? 0 : MSG_DONTWAIT; error = soreceive(softc->ha_so, NULL, &uio, NULL, NULL, &flags); if (error == 0) return (CTL_HA_STATUS_SUCCESS); /* Consider all errors fatal for HA sanity. */ mtx_lock(&softc->ha_lock); if (softc->ha_connected) { softc->ha_disconnect = 1; softc->ha_wakeup = 1; wakeup(&softc->ha_wakeup); } mtx_unlock(&softc->ha_lock); return (CTL_HA_STATUS_ERROR); } /* * Send a message of the specified size. */ ctl_ha_status ctl_ha_msg_send2(ctl_ha_channel channel, const void *addr, size_t len, const void *addr2, size_t len2, int wait) { struct ha_softc *softc = &ha_softc; struct mbuf *mb, *newmb; struct ha_msg_wire hdr; size_t copylen, off; if (!softc->ha_connected) return (CTL_HA_STATUS_DISCONNECT); newmb = m_getm2(NULL, sizeof(hdr) + len + len2, wait, MT_DATA, M_PKTHDR); if (newmb == NULL) { /* Consider all errors fatal for HA sanity. */ mtx_lock(&softc->ha_lock); if (softc->ha_connected) { softc->ha_disconnect = 1; softc->ha_wakeup = 1; wakeup(&softc->ha_wakeup); } mtx_unlock(&softc->ha_lock); printf("%s: Can't allocate mbuf chain\n", __func__); return (CTL_HA_STATUS_ERROR); } hdr.channel = channel; hdr.length = len + len2; mb = newmb; memcpy(mtodo(mb, 0), &hdr, sizeof(hdr)); mb->m_len += sizeof(hdr); off = 0; for (; mb != NULL && off < len; mb = mb->m_next) { copylen = min(M_TRAILINGSPACE(mb), len - off); memcpy(mtodo(mb, mb->m_len), (const char *)addr + off, copylen); mb->m_len += copylen; off += copylen; if (off == len) break; } KASSERT(off == len, ("%s: off (%zu) != len (%zu)", __func__, off, len)); off = 0; for (; mb != NULL && off < len2; mb = mb->m_next) { copylen = min(M_TRAILINGSPACE(mb), len2 - off); memcpy(mtodo(mb, mb->m_len), (const char *)addr2 + off, copylen); mb->m_len += copylen; off += copylen; } KASSERT(off == len2, ("%s: off (%zu) != len2 (%zu)", __func__, off, len2)); newmb->m_pkthdr.len = sizeof(hdr) + len + len2; mtx_lock(&softc->ha_lock); if (!softc->ha_connected) { mtx_unlock(&softc->ha_lock); m_freem(newmb); return (CTL_HA_STATUS_DISCONNECT); } mbufq_enqueue(&softc->ha_sendq, newmb); softc->ha_wakeup = 1; mtx_unlock(&softc->ha_lock); wakeup(&softc->ha_wakeup); return (CTL_HA_STATUS_SUCCESS); } ctl_ha_status ctl_ha_msg_send(ctl_ha_channel channel, const void *addr, size_t len, int wait) { return (ctl_ha_msg_send2(channel, addr, len, NULL, 0, wait)); } ctl_ha_status ctl_ha_msg_abort(ctl_ha_channel channel) { struct ha_softc *softc = &ha_softc; mtx_lock(&softc->ha_lock); softc->ha_disconnect = 1; softc->ha_wakeup = 1; mtx_unlock(&softc->ha_lock); wakeup(&softc->ha_wakeup); return (CTL_HA_STATUS_SUCCESS); } /* * Allocate a data transfer request structure. */ struct ctl_ha_dt_req * ctl_dt_req_alloc(void) { return (malloc(sizeof(struct ctl_ha_dt_req), M_CTL, M_WAITOK | M_ZERO)); } /* * Free a data transfer request structure. */ void ctl_dt_req_free(struct ctl_ha_dt_req *req) { free(req, M_CTL); } /* * Issue a DMA request for a single buffer. */ ctl_ha_status ctl_dt_single(struct ctl_ha_dt_req *req) { struct ha_softc *softc = &ha_softc; struct ha_dt_msg_wire wire_dt; ctl_ha_status status; wire_dt.command = req->command; wire_dt.size = req->size; wire_dt.local = req->local; wire_dt.remote = req->remote; if (req->command == CTL_HA_DT_CMD_READ && req->callback != NULL) { mtx_lock(&softc->ha_lock); TAILQ_INSERT_TAIL(&softc->ha_dts, req, links); mtx_unlock(&softc->ha_lock); ctl_ha_msg_send(CTL_HA_CHAN_DATA, &wire_dt, sizeof(wire_dt), M_WAITOK); return (CTL_HA_STATUS_WAIT); } if (req->command == CTL_HA_DT_CMD_READ) { status = ctl_ha_msg_send(CTL_HA_CHAN_DATA, &wire_dt, sizeof(wire_dt), M_WAITOK); } else { status = ctl_ha_msg_send2(CTL_HA_CHAN_DATA, &wire_dt, sizeof(wire_dt), req->local, req->size, M_WAITOK); } return (status); } static void ctl_dt_event_handler(ctl_ha_channel channel, ctl_ha_event event, int param) { struct ha_softc *softc = &ha_softc; struct ctl_ha_dt_req *req; ctl_ha_status isc_status; if (event == CTL_HA_EVT_MSG_RECV) { struct ha_dt_msg_wire wire_dt; uint8_t *tmp; int size; size = min(sizeof(wire_dt), param); isc_status = ctl_ha_msg_recv(CTL_HA_CHAN_DATA, &wire_dt, size, M_WAITOK); if (isc_status != CTL_HA_STATUS_SUCCESS) { printf("%s: Error receiving message: %d\n", __func__, isc_status); return; } if (wire_dt.command == CTL_HA_DT_CMD_READ) { wire_dt.command = CTL_HA_DT_CMD_WRITE; tmp = wire_dt.local; wire_dt.local = wire_dt.remote; wire_dt.remote = tmp; ctl_ha_msg_send2(CTL_HA_CHAN_DATA, &wire_dt, sizeof(wire_dt), wire_dt.local, wire_dt.size, M_WAITOK); } else if (wire_dt.command == CTL_HA_DT_CMD_WRITE) { isc_status = ctl_ha_msg_recv(CTL_HA_CHAN_DATA, wire_dt.remote, wire_dt.size, M_WAITOK); mtx_lock(&softc->ha_lock); TAILQ_FOREACH(req, &softc->ha_dts, links) { if (req->local == wire_dt.remote) { TAILQ_REMOVE(&softc->ha_dts, req, links); break; } } mtx_unlock(&softc->ha_lock); if (req) { req->ret = isc_status; req->callback(req); } } } else if (event == CTL_HA_EVT_LINK_CHANGE) { CTL_DEBUG_PRINT(("%s: Link state change to %d\n", __func__, param)); if (param != CTL_HA_LINK_ONLINE) { mtx_lock(&softc->ha_lock); while ((req = TAILQ_FIRST(&softc->ha_dts)) != NULL) { TAILQ_REMOVE(&softc->ha_dts, req, links); mtx_unlock(&softc->ha_lock); req->ret = CTL_HA_STATUS_DISCONNECT; req->callback(req); mtx_lock(&softc->ha_lock); } mtx_unlock(&softc->ha_lock); } } else { printf("%s: Unknown event %d\n", __func__, event); } } ctl_ha_status ctl_ha_msg_init(struct ctl_softc *ctl_softc) { struct ha_softc *softc = &ha_softc; int error; softc->ha_ctl_softc = ctl_softc; mtx_init(&softc->ha_lock, "CTL HA mutex", NULL, MTX_DEF); mbufq_init(&softc->ha_sendq, INT_MAX); TAILQ_INIT(&softc->ha_dts); error = kproc_kthread_add(ctl_ha_conn_thread, softc, &ctl_softc->ctl_proc, NULL, 0, 0, "ctl", "ha_tx"); if (error != 0) { printf("error creating CTL HA connection thread!\n"); mtx_destroy(&softc->ha_lock); return (CTL_HA_STATUS_ERROR); } softc->ha_shutdown_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync, ctl_ha_msg_shutdown, ctl_softc, SHUTDOWN_PRI_FIRST); SYSCTL_ADD_PROC(&ctl_softc->sysctl_ctx, SYSCTL_CHILDREN(ctl_softc->sysctl_tree), OID_AUTO, "ha_peer", CTLTYPE_STRING | CTLFLAG_RWTUN, softc, 0, ctl_ha_peer_sysctl, "A", "HA peer connection method"); if (ctl_ha_msg_register(CTL_HA_CHAN_DATA, ctl_dt_event_handler) != CTL_HA_STATUS_SUCCESS) { printf("%s: ctl_ha_msg_register failed.\n", __func__); } return (CTL_HA_STATUS_SUCCESS); }; void ctl_ha_msg_shutdown(struct ctl_softc *ctl_softc) { struct ha_softc *softc = &ha_softc; /* Disconnect and shutdown threads. */ mtx_lock(&softc->ha_lock); if (softc->ha_shutdown < 2) { softc->ha_shutdown = 1; softc->ha_wakeup = 1; wakeup(&softc->ha_wakeup); while (softc->ha_shutdown < 2 && !SCHEDULER_STOPPED()) { msleep(&softc->ha_wakeup, &softc->ha_lock, 0, "shutdown", hz); } } mtx_unlock(&softc->ha_lock); }; ctl_ha_status ctl_ha_msg_destroy(struct ctl_softc *ctl_softc) { struct ha_softc *softc = &ha_softc; if (softc->ha_shutdown_eh != NULL) { EVENTHANDLER_DEREGISTER(shutdown_pre_sync, softc->ha_shutdown_eh); softc->ha_shutdown_eh = NULL; } ctl_ha_msg_shutdown(ctl_softc); /* Just in case. */ if (ctl_ha_msg_deregister(CTL_HA_CHAN_DATA) != CTL_HA_STATUS_SUCCESS) printf("%s: ctl_ha_msg_deregister failed.\n", __func__); mtx_destroy(&softc->ha_lock); return (CTL_HA_STATUS_SUCCESS); }; Index: head/sys/dev/iscsi/icl_soft_proxy.c =================================================================== --- head/sys/dev/iscsi/icl_soft_proxy.c (revision 319721) +++ head/sys/dev/iscsi/icl_soft_proxy.c (revision 319722) @@ -1,375 +1,343 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * sendfile(2) and related extensions: * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ /* * iSCSI Common Layer, kernel proxy part. */ #ifdef ICL_KERNEL_PROXY #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct icl_listen_sock { TAILQ_ENTRY(icl_listen_sock) ils_next; struct icl_listen *ils_listen; struct socket *ils_socket; bool ils_running; - bool ils_disconnecting; int ils_id; }; struct icl_listen { TAILQ_HEAD(, icl_listen_sock) il_sockets; struct sx il_lock; void (*il_accept)(struct socket *, struct sockaddr *, int); }; static MALLOC_DEFINE(M_ICL_PROXY, "ICL_PROXY", "iSCSI common layer proxy"); int icl_soft_proxy_connect(struct icl_conn *ic, int domain, int socktype, int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa) { struct socket *so; int error; int interrupted = 0; error = socreate(domain, &so, socktype, protocol, curthread->td_ucred, curthread); if (error != 0) return (error); if (from_sa != NULL) { error = sobind(so, from_sa, curthread); if (error != 0) { soclose(so); return (error); } } error = soconnect(so, to_sa, curthread); if (error != 0) { soclose(so); return (error); } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, "icl_connect", 0); if (error) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); if (error != 0) { soclose(so); return (error); } error = icl_soft_handoff_sock(ic, so); if (error != 0) soclose(so); return (error); } struct icl_listen * icl_listen_new(void (*accept_cb)(struct socket *, struct sockaddr *, int)) { struct icl_listen *il; il = malloc(sizeof(*il), M_ICL_PROXY, M_ZERO | M_WAITOK); TAILQ_INIT(&il->il_sockets); sx_init(&il->il_lock, "icl_listen"); il->il_accept = accept_cb; return (il); } void icl_listen_free(struct icl_listen *il) { struct icl_listen_sock *ils; sx_xlock(&il->il_lock); while (!TAILQ_EMPTY(&il->il_sockets)) { ils = TAILQ_FIRST(&il->il_sockets); while (ils->ils_running) { ICL_DEBUG("waiting for accept thread to terminate"); sx_xunlock(&il->il_lock); - ils->ils_disconnecting = true; + SOLISTEN_LOCK(ils->ils_socket); + ils->ils_socket->so_error = ENOTCONN; + SOLISTEN_UNLOCK(ils->ils_socket); wakeup(&ils->ils_socket->so_timeo); pause("icl_unlisten", 1 * hz); sx_xlock(&il->il_lock); } TAILQ_REMOVE(&il->il_sockets, ils, ils_next); soclose(ils->ils_socket); free(ils, M_ICL_PROXY); } sx_xunlock(&il->il_lock); free(il, M_ICL_PROXY); } /* - * XXX: Doing accept in a separate thread in each socket might not be the best way - * to do stuff, but it's pretty clean and debuggable - and you probably won't - * have hundreds of listening sockets anyway. + * XXX: Doing accept in a separate thread in each socket might not be the + * best way to do stuff, but it's pretty clean and debuggable - and you + * probably won't have hundreds of listening sockets anyway. */ static void icl_accept_thread(void *arg) { struct icl_listen_sock *ils; struct socket *head, *so; struct sockaddr *sa; int error; ils = arg; head = ils->ils_socket; ils->ils_running = true; for (;;) { - ACCEPT_LOCK(); - while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0 && ils->ils_disconnecting == false) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { - head->so_error = ECONNABORTED; - break; - } - error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, - "accept", 0); - if (error) { - ACCEPT_UNLOCK(); - ICL_WARN("msleep failed with error %d", error); - continue; - } - if (ils->ils_disconnecting) { - ACCEPT_UNLOCK(); - ICL_DEBUG("terminating"); - ils->ils_running = false; - kthread_exit(); - return; - } + SOLISTEN_LOCK(head); + error = solisten_dequeue(head, &so, 0); + if (error == ENOTCONN) { + /* + * XXXGL: ENOTCONN is our mark from icl_listen_free(). + * Neither socket code, nor msleep(9) may return it. + */ + ICL_DEBUG("terminating"); + ils->ils_running = false; + kthread_exit(); + return; } - if (head->so_error) { - error = head->so_error; - head->so_error = 0; - ACCEPT_UNLOCK(); - ICL_WARN("socket error %d", error); + if (error) { + ICL_WARN("solisten_dequeue error %d", error); continue; } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(so != NULL, ("NULL so")); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_state |= (head->so_state & SS_NBIO); - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); sa = NULL; error = soaccept(so, &sa); if (error != 0) { ICL_WARN("soaccept error %d", error); if (sa != NULL) free(sa, M_SONAME); soclose(so); continue; } (ils->ils_listen->il_accept)(so, sa, ils->ils_id); } } static int icl_listen_add_tcp(struct icl_listen *il, int domain, int socktype, int protocol, struct sockaddr *sa, int portal_id) { struct icl_listen_sock *ils; struct socket *so; struct sockopt sopt; int error, one = 1; error = socreate(domain, &so, socktype, protocol, curthread->td_ucred, curthread); if (error != 0) { ICL_WARN("socreate failed with error %d", error); return (error); } sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_REUSEADDR; sopt.sopt_val = &one; sopt.sopt_valsize = sizeof(one); sopt.sopt_td = NULL; error = sosetopt(so, &sopt); if (error != 0) { ICL_WARN("failed to set SO_REUSEADDR with error %d", error); soclose(so); return (error); } error = sobind(so, sa, curthread); if (error != 0) { ICL_WARN("sobind failed with error %d", error); soclose(so); return (error); } error = solisten(so, -1, curthread); if (error != 0) { ICL_WARN("solisten failed with error %d", error); soclose(so); return (error); } ils = malloc(sizeof(*ils), M_ICL_PROXY, M_ZERO | M_WAITOK); ils->ils_listen = il; ils->ils_socket = so; ils->ils_id = portal_id; error = kthread_add(icl_accept_thread, ils, NULL, NULL, 0, 0, "iclacc"); if (error != 0) { ICL_WARN("kthread_add failed with error %d", error); soclose(so); free(ils, M_ICL_PROXY); return (error); } sx_xlock(&il->il_lock); TAILQ_INSERT_TAIL(&il->il_sockets, ils, ils_next); sx_xunlock(&il->il_lock); return (0); } int icl_listen_add(struct icl_listen *il, bool rdma, int domain, int socktype, int protocol, struct sockaddr *sa, int portal_id) { if (rdma) { ICL_DEBUG("RDMA not supported"); return (EOPNOTSUPP); } return (icl_listen_add_tcp(il, domain, socktype, protocol, sa, portal_id)); } int icl_listen_remove(struct icl_listen *il, struct sockaddr *sa) { /* * XXX */ return (EOPNOTSUPP); } #endif /* ICL_KERNEL_PROXY */ Index: head/sys/kern/sys_socket.c =================================================================== --- head/sys/kern/sys_socket.c (revision 319721) +++ head/sys/kern/sys_socket.c (revision 319722) @@ -1,815 +1,818 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD, NULL, "socket AIO stats"); static int empty_results; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results, 0, "socket operation returned EAGAIN"); static int empty_retries; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries, 0, "socket operation retries"); static fo_rdwr_t soo_read; static fo_rdwr_t soo_write; static fo_ioctl_t soo_ioctl; static fo_poll_t soo_poll; extern fo_kqfilter_t soo_kqfilter; static fo_stat_t soo_stat; static fo_close_t soo_close; static fo_fill_kinfo_t soo_fill_kinfo; static fo_aio_queue_t soo_aio_queue; static void soo_aio_cancel(struct kaiocb *job); struct fileops socketops = { .fo_read = soo_read, .fo_write = soo_write, .fo_truncate = invfo_truncate, .fo_ioctl = soo_ioctl, .fo_poll = soo_poll, .fo_kqfilter = soo_kqfilter, .fo_stat = soo_stat, .fo_close = soo_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = soo_fill_kinfo, .fo_aio_queue = soo_aio_queue, .fo_flags = DFLAG_PASSABLE }; static int soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_receive(active_cred, so); if (error) return (error); #endif error = soreceive(so, 0, uio, 0, 0, 0); return (error); } static int soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_send(active_cred, so); if (error) return (error); #endif error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td); if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(uio->uio_td->td_proc); tdsignal(uio->uio_td, SIGPIPE); PROC_UNLOCK(uio->uio_td->td_proc); } return (error); } static int soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; int error = 0; switch (cmd) { case FIONBIO: SOCK_LOCK(so); if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; SOCK_UNLOCK(so); break; case FIOASYNC: - /* - * XXXRW: This code separately acquires SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) even though they are the same - * mutex to avoid introducing the assumption that they are - * the same. - */ if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; + if (SOLISTENING(so)) { + so->sol_sbrcv_flags |= SB_ASYNC; + so->sol_sbsnd_flags |= SB_ASYNC; + } else { + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); + } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_flags |= SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_flags |= SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_snd); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; + if (SOLISTENING(so)) { + so->sol_sbrcv_flags &= ~SB_ASYNC; + so->sol_sbsnd_flags &= ~SB_ASYNC; + } else { + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); + } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_flags &= ~SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_flags &= ~SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_snd); } break; case FIONREAD: /* Unlocked read. */ *(int *)data = sbavail(&so->so_rcv); break; case FIONWRITE: /* Unlocked read. */ *(int *)data = sbavail(&so->so_snd); break; case FIONSPACE: /* Unlocked read. */ if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) || (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) *(int *)data = 0; else *(int *)data = sbspace(&so->so_snd); break; case FIOSETOWN: error = fsetown(*(int *)data, &so->so_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&so->so_sigio); break; case SIOCSPGRP: error = fsetown(-(*(int *)data), &so->so_sigio); break; case SIOCGPGRP: *(int *)data = -fgetown(&so->so_sigio); break; case SIOCATMARK: /* Unlocked read. */ *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0; break; default: /* * Interface/routing/protocol specific ioctls: interface and * routing ioctls should have a different entry since a * socket is unnecessary. */ if (IOCGROUP(cmd) == 'i') error = ifioctl(so, cmd, data, td); else if (IOCGROUP(cmd) == 'r') { CURVNET_SET(so->so_vnet); error = rtioctl_fib(cmd, data, so->so_fibnum); CURVNET_RESTORE(); } else { CURVNET_SET(so->so_vnet); error = ((*so->so_proto->pr_usrreqs->pru_control) (so, cmd, data, 0, td)); CURVNET_RESTORE(); } break; } return (error); } static int soo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; #ifdef MAC int error; error = mac_socket_check_poll(active_cred, so); if (error) return (error); #endif return (sopoll(so, events, fp->f_cred, td)); } static int soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; struct sockbuf *sb; #ifdef MAC int error; #endif bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; #ifdef MAC error = mac_socket_check_stat(active_cred, so); if (error) return (error); #endif /* * If SBS_CANTRCVMORE is set, but there's still data left in the * receive buffer, the socket is still readable. */ sb = &so->so_rcv; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb)) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; ub->st_size = sbavail(sb) - sb->sb_ctl; SOCKBUF_UNLOCK(sb); sb = &so->so_snd; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; SOCKBUF_UNLOCK(sb); ub->st_uid = so->so_cred->cr_uid; ub->st_gid = so->so_cred->cr_gid; return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub); } /* * API socket close on file pointer. We call soclose() to close the socket * (including initiating closing protocols). soclose() will sorele() the * file reference but the actual socket will not go away until the socket's * ref count hits 0. */ static int soo_close(struct file *fp, struct thread *td) { int error = 0; struct socket *so; so = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; if (so) error = soclose(so); return (error); } static int soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct sockaddr *sa; struct inpcb *inpcb; struct unpcb *unpcb; struct socket *so; int error; kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; kif->kf_un.kf_sock.kf_sock_domain0 = so->so_proto->pr_domain->dom_family; kif->kf_un.kf_sock.kf_sock_type0 = so->so_type; kif->kf_un.kf_sock.kf_sock_protocol0 = so->so_proto->pr_protocol; kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; switch (kif->kf_un.kf_sock.kf_sock_domain0) { case AF_INET: case AF_INET6: if (kif->kf_un.kf_sock.kf_sock_protocol0 == IPPROTO_TCP) { if (so->so_pcb != NULL) { inpcb = (struct inpcb *)(so->so_pcb); kif->kf_un.kf_sock.kf_sock_inpcb = (uintptr_t)inpcb->inp_ppcb; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); } } break; case AF_UNIX: if (so->so_pcb != NULL) { unpcb = (struct unpcb *)(so->so_pcb); if (unpcb->unp_conn) { kif->kf_un.kf_sock.kf_sock_unpconn = (uintptr_t)unpcb->unp_conn; kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); } } break; } error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_local)) { bcopy(sa, &kif->kf_un.kf_sock.kf_sa_local, sa->sa_len); free(sa, M_SONAME); } error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_peer)) { bcopy(sa, &kif->kf_un.kf_sock.kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, sizeof(kif->kf_path)); return (0); } /* * Use the 'backend3' field in AIO jobs to store the amount of data * completed by the AIO job so far. */ #define aio_done backend3 static STAILQ_HEAD(, task) soaio_jobs; static struct mtx soaio_jobs_lock; static struct task soaio_kproc_task; static int soaio_starting, soaio_idle, soaio_queued; static struct unrhdr *soaio_kproc_unr; static int soaio_max_procs = MAX_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0, "Maximum number of kernel processes to use for async socket IO"); static int soaio_num_procs; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0, "Number of active kernel processes for async socket IO"); static int soaio_target_procs = TARGET_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD, &soaio_target_procs, 0, "Preferred number of ready kernel processes for async socket IO"); static int soaio_lifetime; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0, "Maximum lifetime for idle aiod"); static void soaio_kproc_loop(void *arg) { struct proc *p; struct vmspace *myvm; struct task *task; int error, id, pending; id = (intptr_t)arg; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = curproc; myvm = vmspace_acquire_ref(p); mtx_lock(&soaio_jobs_lock); MPASS(soaio_starting > 0); soaio_starting--; for (;;) { while (!STAILQ_EMPTY(&soaio_jobs)) { task = STAILQ_FIRST(&soaio_jobs); STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link); soaio_queued--; pending = task->ta_pending; task->ta_pending = 0; mtx_unlock(&soaio_jobs_lock); task->ta_func(task->ta_context, pending); mtx_lock(&soaio_jobs_lock); } MPASS(soaio_queued == 0); if (p->p_vmspace != myvm) { mtx_unlock(&soaio_jobs_lock); vmspace_switch_aio(myvm); mtx_lock(&soaio_jobs_lock); continue; } soaio_idle++; error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-", soaio_lifetime); soaio_idle--; if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) && soaio_num_procs > soaio_target_procs) break; } soaio_num_procs--; mtx_unlock(&soaio_jobs_lock); free_unr(soaio_kproc_unr, id); kproc_exit(0); } static void soaio_kproc_create(void *context, int pending) { struct proc *p; int error, id; mtx_lock(&soaio_jobs_lock); for (;;) { if (soaio_num_procs < soaio_target_procs) { /* Must create */ } else if (soaio_num_procs >= soaio_max_procs) { /* * Hit the limit on kernel processes, don't * create another one. */ break; } else if (soaio_queued <= soaio_idle + soaio_starting) { /* * No more AIO jobs waiting for a process to be * created, so stop. */ break; } soaio_starting++; mtx_unlock(&soaio_jobs_lock); id = alloc_unr(soaio_kproc_unr); error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id, &p, 0, 0, "soaiod%d", id); if (error != 0) { free_unr(soaio_kproc_unr, id); mtx_lock(&soaio_jobs_lock); soaio_starting--; break; } mtx_lock(&soaio_jobs_lock); soaio_num_procs++; } mtx_unlock(&soaio_jobs_lock); } void soaio_enqueue(struct task *task) { mtx_lock(&soaio_jobs_lock); MPASS(task->ta_pending == 0); task->ta_pending++; STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link); soaio_queued++; if (soaio_queued <= soaio_idle) wakeup_one(&soaio_idle); else if (soaio_num_procs < soaio_max_procs) taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task); mtx_unlock(&soaio_jobs_lock); } static void soaio_init(void) { soaio_lifetime = AIOD_LIFETIME_DEFAULT; STAILQ_INIT(&soaio_jobs); mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF); soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL); TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL); if (soaio_target_procs > 0) taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task); } SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL); static __inline int soaio_ready(struct socket *so, struct sockbuf *sb) { return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so)); } static void soaio_process_job(struct socket *so, struct sockbuf *sb, struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct file *fp; struct uio uio; struct iovec iov; size_t cnt, done; long ru_before; int error, flags; SOCKBUF_UNLOCK(sb); aio_switch_vmspace(job); td = curthread; fp = job->fd_file; retry: td_savedcred = td->td_ucred; td->td_ucred = job->cred; done = job->aio_done; cnt = job->uaiocb.aio_nbytes - done; iov.iov_base = (void *)((uintptr_t)job->uaiocb.aio_buf + done); iov.iov_len = cnt; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = cnt; uio.uio_segflg = UIO_USERSPACE; uio.uio_td = td; flags = MSG_NBIO; /* * For resource usage accounting, only count a completed request * as a single message to avoid counting multiple calls to * sosend/soreceive on a blocking socket. */ if (sb == &so->so_rcv) { uio.uio_rw = UIO_READ; ru_before = td->td_ru.ru_msgrcv; #ifdef MAC error = mac_socket_check_receive(fp->f_cred, so); if (error == 0) #endif error = soreceive(so, NULL, &uio, NULL, NULL, &flags); if (td->td_ru.ru_msgrcv != ru_before) job->msgrcv = 1; } else { if (!TAILQ_EMPTY(&sb->sb_aiojobq)) flags |= MSG_MORETOCOME; uio.uio_rw = UIO_WRITE; ru_before = td->td_ru.ru_msgsnd; #ifdef MAC error = mac_socket_check_send(fp->f_cred, so); if (error == 0) #endif error = sosend(so, NULL, &uio, NULL, NULL, flags, td); if (td->td_ru.ru_msgsnd != ru_before) job->msgsnd = 1; if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } done += cnt - uio.uio_resid; job->aio_done = done; td->td_ucred = td_savedcred; if (error == EWOULDBLOCK) { /* * The request was either partially completed or not * completed at all due to racing with a read() or * write() on the socket. If the socket is * non-blocking, return with any partial completion. * If the socket is blocking or if no progress has * been made, requeue this request at the head of the * queue to try again when the socket is ready. */ MPASS(done != job->uaiocb.aio_nbytes); SOCKBUF_LOCK(sb); if (done == 0 || !(so->so_state & SS_NBIO)) { empty_results++; if (soaio_ready(so, sb)) { empty_retries++; SOCKBUF_UNLOCK(sb); goto retry; } if (!aio_set_cancel_function(job, soo_aio_cancel)) { SOCKBUF_UNLOCK(sb); if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); SOCKBUF_LOCK(sb); } else { TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list); } return; } SOCKBUF_UNLOCK(sb); } if (done != 0 && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error) aio_complete(job, -1, error); else aio_complete(job, done, 0); SOCKBUF_LOCK(sb); } static void soaio_process_sb(struct socket *so, struct sockbuf *sb) { struct kaiocb *job; SOCKBUF_LOCK(sb); while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) { job = TAILQ_FIRST(&sb->sb_aiojobq); TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (!aio_clear_cancel_function(job)) continue; soaio_process_job(so, sb, job); } /* * If there are still pending requests, the socket must not be * ready so set SB_AIO to request a wakeup when the socket * becomes ready. */ if (!TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags |= SB_AIO; sb->sb_flags &= ~SB_AIO_RUNNING; SOCKBUF_UNLOCK(sb); - ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); } void soaio_rcv(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, &so->so_rcv); } void soaio_snd(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, &so->so_snd); } void sowakeup_aio(struct socket *so, struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_flags &= ~SB_AIO; if (sb->sb_flags & SB_AIO_RUNNING) return; sb->sb_flags |= SB_AIO_RUNNING; if (sb == &so->so_snd) SOCK_LOCK(so); soref(so); if (sb == &so->so_snd) SOCK_UNLOCK(so); soaio_enqueue(&sb->sb_aiotask); } static void soo_aio_cancel(struct kaiocb *job) { struct socket *so; struct sockbuf *sb; long done; int opcode; so = job->fd_file->f_data; opcode = job->uaiocb.aio_lio_opcode; if (opcode == LIO_READ) sb = &so->so_rcv; else { MPASS(opcode == LIO_WRITE); sb = &so->so_snd; } SOCKBUF_LOCK(sb); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags &= ~SB_AIO; SOCKBUF_UNLOCK(sb); done = job->aio_done; if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); } static int soo_aio_queue(struct file *fp, struct kaiocb *job) { struct socket *so; struct sockbuf *sb; int error; so = fp->f_data; error = (*so->so_proto->pr_usrreqs->pru_aio_queue)(so, job); if (error == 0) return (0); switch (job->uaiocb.aio_lio_opcode) { case LIO_READ: sb = &so->so_rcv; break; case LIO_WRITE: sb = &so->so_snd; break; default: return (EINVAL); } SOCKBUF_LOCK(sb); if (!aio_set_cancel_function(job, soo_aio_cancel)) panic("new job was cancelled"); TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list); if (!(sb->sb_flags & SB_AIO_RUNNING)) { if (soaio_ready(so, sb)) sowakeup_aio(so, sb); else sb->sb_flags |= SB_AIO; } SOCKBUF_UNLOCK(sb); return (0); } Index: head/sys/kern/uipc_accf.c =================================================================== --- head/sys/kern/uipc_accf.c (revision 319721) +++ head/sys/kern/uipc_accf.c (revision 319722) @@ -1,292 +1,306 @@ /*- * Copyright (c) 2000 Paycounter, Inc. * Copyright (c) 2005 Robert N. M. Watson * Author: Alfred Perlstein , * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define ACCEPT_FILTER_MOD #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct mtx accept_filter_mtx; MTX_SYSINIT(accept_filter, &accept_filter_mtx, "accept_filter_mtx", MTX_DEF); #define ACCEPT_FILTER_LOCK() mtx_lock(&accept_filter_mtx) #define ACCEPT_FILTER_UNLOCK() mtx_unlock(&accept_filter_mtx) static SLIST_HEAD(, accept_filter) accept_filtlsthd = SLIST_HEAD_INITIALIZER(accept_filtlsthd); MALLOC_DEFINE(M_ACCF, "accf", "accept filter data"); static int unloadable = 0; SYSCTL_NODE(_net, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters"); SYSCTL_INT(_net_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, "Allow unload of accept filters (not recommended)"); /* * Must be passed a malloc'd structure so we don't explode if the kld is * unloaded, we leak the struct on deallocation to deal with this, but if a * filter is loaded with the same name as a leaked one we re-use the entry. */ int accept_filt_add(struct accept_filter *filt) { struct accept_filter *p; ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, filt->accf_name) == 0) { if (p->accf_callback != NULL) { ACCEPT_FILTER_UNLOCK(); return (EEXIST); } else { p->accf_callback = filt->accf_callback; ACCEPT_FILTER_UNLOCK(); free(filt, M_ACCF); return (0); } } if (p == NULL) SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next); ACCEPT_FILTER_UNLOCK(); return (0); } int accept_filt_del(char *name) { struct accept_filter *p; p = accept_filt_get(name); if (p == NULL) return (ENOENT); p->accf_callback = NULL; return (0); } struct accept_filter * accept_filt_get(char *name) { struct accept_filter *p; ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, name) == 0) break; ACCEPT_FILTER_UNLOCK(); return (p); } int accept_filt_generic_mod_event(module_t mod, int event, void *data) { struct accept_filter *p; struct accept_filter *accfp = (struct accept_filter *) data; int error; switch (event) { case MOD_LOAD: p = malloc(sizeof(*p), M_ACCF, M_WAITOK); bcopy(accfp, p, sizeof(*p)); error = accept_filt_add(p); break; case MOD_UNLOAD: /* * Do not support unloading yet. we don't keep track of * refcounts and unloading an accept filter callback and then * having it called is a bad thing. A simple fix would be to * track the refcount in the struct accept_filter. */ if (unloadable != 0) { error = accept_filt_del(accfp->accf_name); } else error = EOPNOTSUPP; break; case MOD_SHUTDOWN: error = 0; break; default: error = EOPNOTSUPP; break; } return (error); } int accept_filt_getopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; int error; error = 0; afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto out; } - if ((so->so_options & SO_ACCEPTFILTER) == 0) { + if (so->sol_accept_filter == NULL) { error = EINVAL; goto out; } - strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); - if (so->so_accf->so_accept_filter_str != NULL) - strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); + strcpy(afap->af_name, so->sol_accept_filter->accf_name); + if (so->sol_accept_filter_str != NULL) + strcpy(afap->af_arg, so->sol_accept_filter_str); out: SOCK_UNLOCK(so); if (error == 0) error = sooptcopyout(sopt, afap, sizeof(*afap)); free(afap, M_TEMP); return (error); } int accept_filt_setopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; struct accept_filter *afp; - struct so_accf *newaf; - int error = 0; + char *accept_filter_str = NULL; + void *accept_filter_arg = NULL; + int error; /* * Handle the simple delete case first. */ if (sopt == NULL || sopt->sopt_val == NULL) { + struct socket *sp, *sp1; + int wakeup; + SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { SOCK_UNLOCK(so); return (EINVAL); } - if (so->so_accf != NULL) { - struct so_accf *af = so->so_accf; - if (af->so_accept_filter != NULL && - af->so_accept_filter->accf_destroy != NULL) { - af->so_accept_filter->accf_destroy(so); - } - if (af->so_accept_filter_str != NULL) - free(af->so_accept_filter_str, M_ACCF); - free(af, M_ACCF); - so->so_accf = NULL; + if (so->sol_accept_filter == NULL) { + SOCK_UNLOCK(so); + return (0); } + if (so->sol_accept_filter->accf_destroy != NULL) + so->sol_accept_filter->accf_destroy(so); + if (so->sol_accept_filter_str != NULL) + free(so->sol_accept_filter_str, M_ACCF); + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; so->so_options &= ~SO_ACCEPTFILTER; - SOCK_UNLOCK(so); + + /* + * Move from incomplete queue to complete only those + * connections, that are blocked by us. + */ + wakeup = 0; + TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) { + SOCK_LOCK(sp); + if (sp->so_options & SO_ACCEPTFILTER) { + TAILQ_REMOVE(&so->sol_incomp, sp, so_list); + TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list); + sp->so_qstate = SQ_COMP; + sp->so_options &= ~SO_ACCEPTFILTER; + so->sol_incqlen--; + so->sol_qlen++; + wakeup = 1; + } + SOCK_UNLOCK(sp); + } + if (wakeup) + solisten_wakeup(so); /* unlocks */ + else + SOLISTEN_UNLOCK(so); return (0); } /* * Pre-allocate any memory we may need later to avoid blocking at * untimely moments. This does not optimize for invalid arguments. */ afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; if (error) { free(afap, M_TEMP); return (error); } afp = accept_filt_get(afap->af_name); if (afp == NULL) { free(afap, M_TEMP); return (ENOENT); } - /* - * Allocate the new accept filter instance storage. We may - * have to free it again later if we fail to attach it. If - * attached properly, 'newaf' is NULLed to avoid a free() - * while in use. - */ - newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK | M_ZERO); if (afp->accf_create != NULL && afap->af_name[0] != '\0') { size_t len = strlen(afap->af_name) + 1; - newaf->so_accept_filter_str = malloc(len, M_ACCF, M_WAITOK); - strcpy(newaf->so_accept_filter_str, afap->af_name); + accept_filter_str = malloc(len, M_ACCF, M_WAITOK); + strcpy(accept_filter_str, afap->af_name); } /* * Require a listen socket; don't try to replace an existing filter * without first removing it. */ SOCK_LOCK(so); - if (((so->so_options & SO_ACCEPTCONN) == 0) || - (so->so_accf != NULL)) { + if ((so->so_options & SO_ACCEPTCONN) == 0 || + so->sol_accept_filter != NULL) { error = EINVAL; goto out; } /* * Invoke the accf_create() method of the filter if required. The * socket mutex is held over this call, so create methods for filters * can't block. */ if (afp->accf_create != NULL) { - newaf->so_accept_filter_arg = - afp->accf_create(so, afap->af_arg); - if (newaf->so_accept_filter_arg == NULL) { + accept_filter_arg = afp->accf_create(so, afap->af_arg); + if (accept_filter_arg == NULL) { error = EINVAL; goto out; } } - newaf->so_accept_filter = afp; - so->so_accf = newaf; + so->sol_accept_filter = afp; + so->sol_accept_filter_arg = accept_filter_arg; + so->sol_accept_filter_str = accept_filter_str; so->so_options |= SO_ACCEPTFILTER; - newaf = NULL; out: SOCK_UNLOCK(so); - if (newaf != NULL) { - if (newaf->so_accept_filter_str != NULL) - free(newaf->so_accept_filter_str, M_ACCF); - free(newaf, M_ACCF); - } - if (afap != NULL) - free(afap, M_TEMP); + if (accept_filter_str != NULL) + free(accept_filter_str, M_ACCF); + free(afap, M_TEMP); return (error); } Index: head/sys/kern/uipc_debug.c =================================================================== --- head/sys/kern/uipc_debug.c (revision 319721) +++ head/sys/kern/uipc_debug.c (revision 319722) @@ -1,532 +1,534 @@ /*- * Copyright (c) 2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Debugger routines relating to sockets, protocols, etc, for use in DDB. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #ifdef DDB #include static void db_print_sotype(short so_type) { switch (so_type) { case SOCK_STREAM: db_printf("SOCK_STREAM"); break; case SOCK_DGRAM: db_printf("SOCK_DGRAM"); break; case SOCK_RAW: db_printf("SOCK_RAW"); break; case SOCK_RDM: db_printf("SOCK_RDM"); break; case SOCK_SEQPACKET: db_printf("SOCK_SEQPACKET"); break; default: db_printf("unknown"); break; } } static void db_print_sooptions(short so_options) { int comma; comma = 0; if (so_options & SO_DEBUG) { db_printf("%sSO_DEBUG", comma ? ", " : ""); comma = 1; } if (so_options & SO_ACCEPTCONN) { db_printf("%sSO_ACCEPTCONN", comma ? ", " : ""); comma = 1; } if (so_options & SO_REUSEADDR) { db_printf("%sSO_REUSEADDR", comma ? ", " : ""); comma = 1; } if (so_options & SO_KEEPALIVE) { db_printf("%sSO_KEEPALIVE", comma ? ", " : ""); comma = 1; } if (so_options & SO_DONTROUTE) { db_printf("%sSO_DONTROUTE", comma ? ", " : ""); comma = 1; } if (so_options & SO_BROADCAST) { db_printf("%sSO_BROADCAST", comma ? ", " : ""); comma = 1; } if (so_options & SO_USELOOPBACK) { db_printf("%sSO_USELOOPBACK", comma ? ", " : ""); comma = 1; } if (so_options & SO_LINGER) { db_printf("%sSO_LINGER", comma ? ", " : ""); comma = 1; } if (so_options & SO_OOBINLINE) { db_printf("%sSO_OOBINLINE", comma ? ", " : ""); comma = 1; } if (so_options & SO_REUSEPORT) { db_printf("%sSO_REUSEPORT", comma ? ", " : ""); comma = 1; } if (so_options & SO_TIMESTAMP) { db_printf("%sSO_TIMESTAMP", comma ? ", " : ""); comma = 1; } if (so_options & SO_NOSIGPIPE) { db_printf("%sSO_NOSIGPIPE", comma ? ", " : ""); comma = 1; } if (so_options & SO_ACCEPTFILTER) { db_printf("%sSO_ACCEPTFILTER", comma ? ", " : ""); comma = 1; } if (so_options & SO_BINTIME) { db_printf("%sSO_BINTIME", comma ? ", " : ""); comma = 1; } if (so_options & SO_NO_OFFLOAD) { db_printf("%sSO_NO_OFFLOAD", comma ? ", " : ""); comma = 1; } if (so_options & SO_NO_DDP) { db_printf("%sSO_NO_DDP", comma ? ", " : ""); comma = 1; } } static void db_print_sostate(short so_state) { int comma; comma = 0; if (so_state & SS_NOFDREF) { db_printf("%sSS_NOFDREF", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISCONNECTED) { db_printf("%sSS_ISCONNECTED", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISCONNECTING) { db_printf("%sSS_ISCONNECTING", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISDISCONNECTING) { db_printf("%sSS_ISDISCONNECTING", comma ? ", " : ""); comma = 1; } if (so_state & SS_NBIO) { db_printf("%sSS_NBIO", comma ? ", " : ""); comma = 1; } if (so_state & SS_ASYNC) { db_printf("%sSS_ASYNC", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISCONFIRMING) { db_printf("%sSS_ISCONFIRMING", comma ? ", " : ""); comma = 1; } if (so_state & SS_PROTOREF) { db_printf("%sSS_PROTOREF", comma ? ", " : ""); comma = 1; } } static void db_print_soqstate(int so_qstate) { int comma; comma = 0; if (so_qstate & SQ_INCOMP) { db_printf("%sSQ_INCOMP", comma ? ", " : ""); comma = 1; } if (so_qstate & SQ_COMP) { db_printf("%sSQ_COMP", comma ? ", " : ""); comma = 1; } } static void db_print_sbstate(short sb_state) { int comma; comma = 0; if (sb_state & SBS_CANTSENDMORE) { db_printf("%sSBS_CANTSENDMORE", comma ? ", " : ""); comma = 1; } if (sb_state & SBS_CANTRCVMORE) { db_printf("%sSBS_CANTRCVMORE", comma ? ", " : ""); comma = 1; } if (sb_state & SBS_RCVATMARK) { db_printf("%sSBS_RCVATMARK", comma ? ", " : ""); comma = 1; } } static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_domain(struct domain *d, const char *domain_name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", domain_name, d); indent += 2; db_print_indent(indent); db_printf("dom_family: %d ", d->dom_family); db_printf("dom_name: %s\n", d->dom_name); db_print_indent(indent); db_printf("dom_init: %p ", d->dom_init); db_printf("dom_externalize: %p ", d->dom_externalize); db_printf("dom_dispose: %p\n", d->dom_dispose); db_print_indent(indent); db_printf("dom_protosw: %p ", d->dom_protosw); db_printf("dom_next: %p\n", d->dom_next); db_print_indent(indent); db_printf("dom_rtattach: %p ", d->dom_rtattach); db_print_indent(indent); db_printf("dom_ifattach: %p ", d->dom_ifattach); db_printf("dom_ifdetach: %p\n", d->dom_ifdetach); } static void db_print_prflags(short pr_flags) { int comma; comma = 0; if (pr_flags & PR_ATOMIC) { db_printf("%sPR_ATOMIC", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_ADDR) { db_printf("%sPR_ADDR", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_CONNREQUIRED) { db_printf("%sPR_CONNREQUIRED", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_WANTRCVD) { db_printf("%sPR_WANTRCVD", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_RIGHTS) { db_printf("%sPR_RIGHTS", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_IMPLOPCL) { db_printf("%sPR_IMPLOPCL", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_LASTHDR) { db_printf("%sPR_LASTHDR", comma ? ", " : ""); comma = 1; } } static void db_print_protosw(struct protosw *pr, const char *prname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", prname, pr); indent += 2; db_print_indent(indent); db_printf("pr_type: %d ", pr->pr_type); db_printf("pr_domain: %p\n", pr->pr_domain); if (pr->pr_domain != NULL) db_print_domain(pr->pr_domain, "pr_domain", indent); db_print_indent(indent); db_printf("pr_protocol: %d\n", pr->pr_protocol); db_print_indent(indent); db_printf("pr_flags: %d (", pr->pr_flags); db_print_prflags(pr->pr_flags); db_printf(")\n"); db_print_indent(indent); db_printf("pr_input: %p ", pr->pr_input); db_printf("pr_output: %p ", pr->pr_output); db_printf("pr_ctlinput: %p\n", pr->pr_ctlinput); db_print_indent(indent); db_printf("pr_ctloutput: %p ", pr->pr_ctloutput); db_printf("pr_init: %p\n", pr->pr_init); db_print_indent(indent); db_printf("pr_fasttimo: %p ", pr->pr_fasttimo); db_printf("pr_slowtimo: %p ", pr->pr_slowtimo); db_printf("pr_drain: %p\n", pr->pr_drain); db_print_indent(indent); } static void db_print_sbflags(short sb_flags) { int comma; comma = 0; if (sb_flags & SB_WAIT) { db_printf("%sSB_WAIT", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_SEL) { db_printf("%sSB_SEL", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_ASYNC) { db_printf("%sSB_ASYNC", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_UPCALL) { db_printf("%sSB_UPCALL", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_NOINTR) { db_printf("%sSB_NOINTR", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_AIO) { db_printf("%sSB_AIO", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_KNOTE) { db_printf("%sSB_KNOTE", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_AUTOSIZE) { db_printf("%sSB_AUTOSIZE", comma ? ", " : ""); comma = 1; } } static void db_print_sockbuf(struct sockbuf *sb, const char *sockbufname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", sockbufname, sb); indent += 2; db_print_indent(indent); db_printf("sb_state: 0x%x (", sb->sb_state); db_print_sbstate(sb->sb_state); db_printf(")\n"); db_print_indent(indent); db_printf("sb_mb: %p ", sb->sb_mb); db_printf("sb_mbtail: %p ", sb->sb_mbtail); db_printf("sb_lastrecord: %p\n", sb->sb_lastrecord); db_print_indent(indent); db_printf("sb_sndptr: %p ", sb->sb_sndptr); db_printf("sb_sndptroff: %u\n", sb->sb_sndptroff); db_print_indent(indent); db_printf("sb_acc: %u ", sb->sb_acc); db_printf("sb_ccc: %u ", sb->sb_ccc); db_printf("sb_hiwat: %u ", sb->sb_hiwat); db_printf("sb_mbcnt: %u ", sb->sb_mbcnt); db_printf("sb_mbmax: %u\n", sb->sb_mbmax); db_print_indent(indent); db_printf("sb_ctl: %u ", sb->sb_ctl); db_printf("sb_lowat: %d ", sb->sb_lowat); db_printf("sb_timeo: %jd\n", sb->sb_timeo); db_print_indent(indent); db_printf("sb_flags: 0x%x (", sb->sb_flags); db_print_sbflags(sb->sb_flags); db_printf(")\n"); db_print_indent(indent); db_printf("sb_aiojobq first: %p\n", TAILQ_FIRST(&sb->sb_aiojobq)); } static void db_print_socket(struct socket *so, const char *socketname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", socketname, so); indent += 2; db_print_indent(indent); db_printf("so_count: %d ", so->so_count); db_printf("so_type: %d (", so->so_type); db_print_sotype(so->so_type); db_printf(")\n"); db_print_indent(indent); db_printf("so_options: 0x%x (", so->so_options); db_print_sooptions(so->so_options); db_printf(")\n"); db_print_indent(indent); db_printf("so_linger: %d ", so->so_linger); db_printf("so_state: 0x%x (", so->so_state); db_print_sostate(so->so_state); db_printf(")\n"); db_print_indent(indent); - db_printf("so_qstate: 0x%x (", so->so_qstate); - db_print_soqstate(so->so_qstate); db_printf(") "); db_printf("so_pcb: %p ", so->so_pcb); db_printf("so_proto: %p\n", so->so_proto); if (so->so_proto != NULL) db_print_protosw(so->so_proto, "so_proto", indent); db_print_indent(indent); - db_printf("so_head: %p ", so->so_head); - db_printf("so_incomp first: %p ", TAILQ_FIRST(&so->so_incomp)); - db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp)); + if (so->so_options & SO_ACCEPTCONN) { + db_printf("sol_incomp first: %p ", + TAILQ_FIRST(&so->sol_incomp)); + db_printf("sol_comp first: %p\n", TAILQ_FIRST(&so->sol_comp)); + db_printf("sol_qlen: %d ", so->sol_qlen); + db_printf("sol_incqlen: %d ", so->sol_incqlen); + db_printf("sol_qlimit: %d ", so->sol_qlimit); + } else { + db_printf("so_qstate: 0x%x (", so->so_qstate); + db_print_soqstate(so->so_qstate); + db_printf("so_listen: %p ", so->so_listen); + /* so_list skipped */ + db_printf("so_timeo: %d ", so->so_timeo); + db_printf("so_error: %d\n", so->so_error); - db_print_indent(indent); - /* so_list skipped */ - db_printf("so_qlen: %u ", so->so_qlen); - db_printf("so_incqlen: %u ", so->so_incqlen); - db_printf("so_qlimit: %u ", so->so_qlimit); - db_printf("so_timeo: %d ", so->so_timeo); - db_printf("so_error: %d\n", so->so_error); + db_print_indent(indent); + db_printf("so_sigio: %p ", so->so_sigio); + db_printf("so_oobmark: %lu ", so->so_oobmark); - db_print_indent(indent); - db_printf("so_sigio: %p ", so->so_sigio); - db_printf("so_oobmark: %lu ", so->so_oobmark); - - db_print_sockbuf(&so->so_rcv, "so_rcv", indent); - db_print_sockbuf(&so->so_snd, "so_snd", indent); + db_print_sockbuf(&so->so_rcv, "so_rcv", indent); + db_print_sockbuf(&so->so_snd, "so_snd", indent); + } } DB_SHOW_COMMAND(socket, db_show_socket) { struct socket *so; if (!have_addr) { db_printf("usage: show socket \n"); return; } so = (struct socket *)addr; db_print_socket(so, "socket", 0); } DB_SHOW_COMMAND(sockbuf, db_show_sockbuf) { struct sockbuf *sb; if (!have_addr) { db_printf("usage: show sockbuf \n"); return; } sb = (struct sockbuf *)addr; db_print_sockbuf(sb, "sockbuf", 0); } DB_SHOW_COMMAND(protosw, db_show_protosw) { struct protosw *pr; if (!have_addr) { db_printf("usage: show protosw \n"); return; } pr = (struct protosw *)addr; db_print_protosw(pr, "protosw", 0); } DB_SHOW_COMMAND(domain, db_show_domain) { struct domain *d; if (!have_addr) { db_printf("usage: show protosw \n"); return; } d = (struct domain *)addr; db_print_domain(d, "domain", 0); } #endif Index: head/sys/kern/uipc_sockbuf.c =================================================================== --- head/sys/kern/uipc_sockbuf.c (revision 319721) +++ head/sys/kern/uipc_sockbuf.c (revision 319722) @@ -1,1349 +1,1349 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include #include /* for aio_swake proto */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Function pointer set by the AIO routines so that the socket buffer code * can call back into the AIO module if it is loaded. */ void (*aio_swake)(struct socket *, struct sockbuf *); /* * Primitive routines for operating on socket buffers */ u_long sb_max = SB_MAX; u_long sb_max_adj = (quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */ static u_long sb_efficiency = 8; /* parameter for sbreserve() */ static struct mbuf *sbcut_internal(struct sockbuf *sb, int len); static void sbflush_internal(struct sockbuf *sb); /* * Our own version of m_clrprotoflags(), that can preserve M_NOTREADY. */ static void sbm_clrprotoflags(struct mbuf *m, int flags) { int mask; mask = ~M_PROTOFLAGS; if (flags & PRUS_NOTREADY) mask |= M_NOTREADY; while (m) { m->m_flags &= mask; m = m->m_next; } } /* * Mark ready "count" mbufs starting with "m". */ int sbready(struct sockbuf *sb, struct mbuf *m, int count) { u_int blocker; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb)); blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0; for (int i = 0; i < count; i++, m = m->m_next) { KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); m->m_flags &= ~(M_NOTREADY | blocker); if (blocker) sb->sb_acc += m->m_len; } if (!blocker) return (EINPROGRESS); /* This one was blocking all the queue. */ for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) { KASSERT(m->m_flags & M_BLOCKED, ("%s: m %p !M_BLOCKED", __func__, m)); m->m_flags &= ~M_BLOCKED; sb->sb_acc += m->m_len; } sb->sb_fnrdy = m; return (0); } /* * Adjust sockbuf state reflecting allocation of m. */ void sballoc(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) { if (m->m_flags & M_NOTREADY) sb->sb_fnrdy = m; else sb->sb_acc += m->m_len; } else m->m_flags |= M_BLOCKED; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl += m->m_len; sb->sb_mbcnt += MSIZE; sb->sb_mcnt += 1; if (m->m_flags & M_EXT) { sb->sb_mbcnt += m->m_ext.ext_size; sb->sb_ccnt += 1; } } /* * Adjust sockbuf state reflecting freeing of m. */ void sbfree(struct sockbuf *sb, struct mbuf *m) { #if 0 /* XXX: not yet: soclose() call path comes here w/o lock. */ SOCKBUF_LOCK_ASSERT(sb); #endif sb->sb_ccc -= m->m_len; if (!(m->m_flags & M_NOTAVAIL)) sb->sb_acc -= m->m_len; if (m == sb->sb_fnrdy) { struct mbuf *n; KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); n = m->m_next; while (n != NULL && !(n->m_flags & M_NOTREADY)) { n->m_flags &= ~M_BLOCKED; sb->sb_acc += n->m_len; n = n->m_next; } sb->sb_fnrdy = n; } if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl -= m->m_len; sb->sb_mbcnt -= MSIZE; sb->sb_mcnt -= 1; if (m->m_flags & M_EXT) { sb->sb_mbcnt -= m->m_ext.ext_size; sb->sb_ccnt -= 1; } if (sb->sb_sndptr == m) { sb->sb_sndptr = NULL; sb->sb_sndptroff = 0; } if (sb->sb_sndptroff != 0) sb->sb_sndptroff -= m->m_len; } /* * Socantsendmore indicates that no more data will be sent on the socket; it * would normally be applied to a socket when the user informs the system * that no more data is to be sent, by the protocol code (in case * PRU_SHUTDOWN). Socantrcvmore indicates that no more data will be * received, and will normally be applied to the socket by a protocol when it * detects that the peer will send no more data. Data queued for reading in * the socket may yet be read. */ void socantsendmore_locked(struct socket *so) { SOCKBUF_LOCK_ASSERT(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; sowwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantsendmore(struct socket *so) { SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantrcvmore_locked(struct socket *so) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); so->so_rcv.sb_state |= SBS_CANTRCVMORE; sorwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } void socantrcvmore(struct socket *so) { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } /* * Wait for data to arrive at/drain from a socket buffer. */ int sbwait(struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_flags |= SB_WAIT; return (msleep_sbt(&sb->sb_acc, &sb->sb_mtx, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", sb->sb_timeo, 0, 0)); } int sblock(struct sockbuf *sb, int flags) { KASSERT((flags & SBL_VALID) == flags, ("sblock: flags invalid (0x%x)", flags)); if (flags & SBL_WAIT) { if ((sb->sb_flags & SB_NOINTR) || (flags & SBL_NOINTR)) { sx_xlock(&sb->sb_sx); return (0); } return (sx_xlock_sig(&sb->sb_sx)); } else { if (sx_try_xlock(&sb->sb_sx) == 0) return (EWOULDBLOCK); return (0); } } void sbunlock(struct sockbuf *sb) { sx_xunlock(&sb->sb_sx); } /* * Wakeup processes waiting on a socket buffer. Do asynchronous notification * via SIGIO if the socket has the SS_ASYNC flag set. * * Called with the socket buffer lock held; will release the lock by the end * of the function. This allows the caller to acquire the socket buffer lock * while testing for the need for various sorts of wakeup and hold it through * to the point where it's no longer required. We currently hold the lock * through calls out to other subsystems (with the exception of kqueue), and * then release it to avoid lock order issues. It's not clear that's * correct. */ void sowakeup(struct socket *so, struct sockbuf *sb) { int ret; SOCKBUF_LOCK_ASSERT(sb); - selwakeuppri(&sb->sb_sel, PSOCK); - if (!SEL_WAITING(&sb->sb_sel)) + selwakeuppri(sb->sb_sel, PSOCK); + if (!SEL_WAITING(sb->sb_sel)) sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_acc); } - KNOTE_LOCKED(&sb->sb_sel.si_note, 0); + KNOTE_LOCKED(&sb->sb_sel->si_note, 0); if (sb->sb_upcall != NULL && !(so->so_state & SS_ISDISCONNECTED)) { ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT); if (ret == SU_ISCONNECTED) { KASSERT(sb == &so->so_rcv, ("SO_SND upcall returned SU_ISCONNECTED")); soupcall_clear(so, SO_RCV); } } else ret = SU_OK; if (sb->sb_flags & SB_AIO) sowakeup_aio(so, sb); SOCKBUF_UNLOCK(sb); if (ret == SU_ISCONNECTED) soisconnected(so); if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED); } /* * Socket buffer (struct sockbuf) utility routines. * * Each socket contains two socket buffers: one for sending data and one for * receiving data. Each buffer contains a queue of mbufs, information about * the number of mbufs and amount of data in the queue, and other fields * allowing select() statements and notification on data availability to be * implemented. * * Data stored in a socket buffer is maintained as a list of records. Each * record is a list of mbufs chained together with the m_next field. Records * are chained together with the m_nextpkt field. The upper level routine * soreceive() expects the following conventions to be observed when placing * information in the receive buffer: * * 1. If the protocol requires each message be preceded by the sender's name, * then a record containing that name must be present before any * associated data (mbuf's must be of type MT_SONAME). * 2. If the protocol supports the exchange of ``access rights'' (really just * additional data associated with the message), and there are ``rights'' * to be received, then a record containing this data should be present * (mbuf's must be of type MT_RIGHTS). * 3. If a name or rights record exists, then it must be followed by a data * record, perhaps of zero length. * * Before using a new socket structure it is first necessary to reserve * buffer space to the socket, by calling sbreserve(). This should commit * some of the available buffer space in the system buffer pool for the * socket (currently, it does nothing but enforce limits). The space should * be released by calling sbrelease() when the socket is destroyed. */ int soreserve(struct socket *so, u_long sndcc, u_long rcvcc) { struct thread *td = curthread; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0) goto bad; if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0) goto bad2; if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (0); bad2: sbrelease_locked(&so->so_snd, so); bad: SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (ENOBUFS); } static int sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS) { int error = 0; u_long tmp_sb_max = sb_max; error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req); if (error || !req->newptr) return (error); if (tmp_sb_max < MSIZE + MCLBYTES) return (EINVAL); sb_max = tmp_sb_max; sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES); return (0); } /* * Allot mbufs to a sockbuf. Attempt to scale mbmax so that mbcnt doesn't * become limiting if buffering efficiency is near the normal case. */ int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td) { rlim_t sbsize_limit; SOCKBUF_LOCK_ASSERT(sb); /* * When a thread is passed, we take into account the thread's socket * buffer size limit. The caller will generally pass curthread, but * in the TCP input path, NULL will be passed to indicate that no * appropriate thread resource limits are available. In that case, * we don't apply a process limit. */ if (cc > sb_max_adj) return (0); if (td != NULL) { sbsize_limit = lim_cur(td, RLIMIT_SBSIZE); } else sbsize_limit = RLIM_INFINITY; if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, sbsize_limit)) return (0); sb->sb_mbmax = min(cc * sb_efficiency, sb_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (1); } int sbreserve(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td) { int error; SOCKBUF_LOCK(sb); error = sbreserve_locked(sb, cc, so, td); SOCKBUF_UNLOCK(sb); return (error); } /* * Free mbufs held by a socket, and reserved mbuf space. */ void sbrelease_internal(struct sockbuf *sb, struct socket *so) { sbflush_internal(sb); (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); sb->sb_mbmax = 0; } void sbrelease_locked(struct sockbuf *sb, struct socket *so) { SOCKBUF_LOCK_ASSERT(sb); sbrelease_internal(sb, so); } void sbrelease(struct sockbuf *sb, struct socket *so) { SOCKBUF_LOCK(sb); sbrelease_locked(sb, so); SOCKBUF_UNLOCK(sb); } void sbdestroy(struct sockbuf *sb, struct socket *so) { sbrelease_internal(sb, so); } /* * Routines to add and remove data from an mbuf queue. * * The routines sbappend() or sbappendrecord() are normally called to append * new mbufs to a socket buffer, after checking that adequate space is * available, comparing the function sbspace() with the amount of data to be * added. sbappendrecord() differs from sbappend() in that data supplied is * treated as the beginning of a new record. To place a sender's address, * optional access rights, and data in a socket receive buffer, * sbappendaddr() should be used. To place access rights and data in a * socket receive buffer, sbappendrights() should be used. In either case, * the new data begins a new record. Note that unlike sbappend() and * sbappendrecord(), these routines check for the caller that there will be * enough space to store the data. Each fails if there is not enough space, * or if it cannot find mbufs to store additional information in. * * Reliable protocols may use the socket send buffer to hold data awaiting * acknowledgement. Data is normally copied from a socket send buffer in a * protocol with m_copy for output to a peer, and then removing the data from * the socket buffer with sbdrop() or sbdroprecord() when the data is * acknowledged by the peer. */ #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; if (m != sb->sb_lastrecord) { printf("%s: sb_mb %p sb_lastrecord %p last %p\n", __func__, sb->sb_mb, sb->sb_lastrecord, m); printf("packet chain:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) printf("\t%p\n", m); panic("%s from %s:%u", __func__, file, line); } } void sblastmbufchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mbtail) { printf("%s: sb_mb %p sb_mbtail %p last %p\n", __func__, sb->sb_mb, sb->sb_mbtail, m); printf("packet tree:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { printf("\t"); for (n = m; n != NULL; n = n->m_next) printf("%p ", n); printf("\n"); } panic("%s from %s:%u", __func__, file, line); } } #endif /* SOCKBUF_DEBUG */ #define SBLINKRECORD(sb, m0) do { \ SOCKBUF_LOCK_ASSERT(sb); \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ (sb)->sb_mb = (m0); \ (sb)->sb_lastrecord = (m0); \ } while (/*CONSTCOND*/0) /* * Append mbuf chain m to the last record in the socket buffer sb. The * additional space associated the mbuf chain is recorded in sb. Empty mbufs * are discarded and mbufs are compacted where possible. */ void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags) { struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); if (m == NULL) return; sbm_clrprotoflags(m, flags); SBLASTRECORDCHK(sb); n = sb->sb_mb; if (n) { while (n->m_nextpkt) n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * XXX Would like to simply use sb_mbtail here, but * XXX I need to verify that I won't miss an EOR that * XXX way. */ if ((n = sb->sb_lastrecord) != NULL) { do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * If this is the first record in the socket buffer, * it's also the last record. */ sb->sb_lastrecord = m; } } sbcompress(sb, m, n); SBLASTRECORDCHK(sb); } /* * Append mbuf chain m to the last record in the socket buffer sb. The * additional space associated the mbuf chain is recorded in sb. Empty mbufs * are discarded and mbufs are compacted where possible. */ void sbappend(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK(sb); sbappend_locked(sb, m, flags); SOCKBUF_UNLOCK(sb); } /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, * that is, a stream protocol (such as TCP). */ void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); SBLASTMBUFCHK(sb); /* Remove all packet headers and mbuf tags to get a pure data chain. */ m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0); sbcompress(sb, m, sb->sb_mbtail); sb->sb_lastrecord = sb->sb_mb; SBLASTRECORDCHK(sb); } /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, * that is, a stream protocol (such as TCP). */ void sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK(sb); sbappendstream_locked(sb, m, flags); SOCKBUF_UNLOCK(sb); } #ifdef SOCKBUF_DEBUG void sbcheck(struct sockbuf *sb, const char *file, int line) { struct mbuf *m, *n, *fnrdy; u_long acc, ccc, mbcnt; SOCKBUF_LOCK_ASSERT(sb); acc = ccc = mbcnt = 0; fnrdy = NULL; for (m = sb->sb_mb; m; m = n) { n = m->m_nextpkt; for (; m; m = m->m_next) { if (m->m_len == 0) { printf("sb %p empty mbuf %p\n", sb, m); goto fail; } if ((m->m_flags & M_NOTREADY) && fnrdy == NULL) { if (m != sb->sb_fnrdy) { printf("sb %p: fnrdy %p != m %p\n", sb, sb->sb_fnrdy, m); goto fail; } fnrdy = m; } if (fnrdy) { if (!(m->m_flags & M_NOTAVAIL)) { printf("sb %p: fnrdy %p, m %p is avail\n", sb, sb->sb_fnrdy, m); goto fail; } } else acc += m->m_len; ccc += m->m_len; mbcnt += MSIZE; if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; } } if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) { printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n", acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt); goto fail; } return; fail: panic("%s from %s:%u", __func__, file, line); } #endif /* * As above, except the mbuf chain begins a new record. */ void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0) { struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); if (m0 == NULL) return; m_clrprotoflags(m0); /* * Put the first mbuf on the queue. Note this permits zero length * records. */ sballoc(sb, m0); SBLASTRECORDCHK(sb); SBLINKRECORD(sb, m0); sb->sb_mbtail = m0; m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } /* always call sbcompress() so it can do SBLASTMBUFCHK() */ sbcompress(sb, m, m0); } /* * As above, except the mbuf chain begins a new record. */ void sbappendrecord(struct sockbuf *sb, struct mbuf *m0) { SOCKBUF_LOCK(sb); sbappendrecord_locked(sb, m0); SOCKBUF_UNLOCK(sb); } /* Helper routine that appends data, control, and address to a sockbuf. */ static int sbappendaddr_locked_internal(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control, struct mbuf *ctrl_last) { struct mbuf *m, *n, *nlast; #if MSIZE <= 256 if (asa->sa_len > MLEN) return (0); #endif m = m_get(M_NOWAIT, MT_SONAME); if (m == NULL) return (0); m->m_len = asa->sa_len; bcopy(asa, mtod(m, caddr_t), asa->sa_len); if (m0) { m_clrprotoflags(m0); m_tag_delete_chain(m0, NULL); /* * Clear some persistent info from pkthdr. * We don't use m_demote(), because some netgraph consumers * expect M_PKTHDR presence. */ m0->m_pkthdr.rcvif = NULL; m0->m_pkthdr.flowid = 0; m0->m_pkthdr.csum_flags = 0; m0->m_pkthdr.fibnum = 0; m0->m_pkthdr.rsstype = 0; } if (ctrl_last) ctrl_last->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; int space = asa->sa_len; SOCKBUF_LOCK_ASSERT(sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr_locked"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &ctrl_last); if (space > sbspace(sb)) return (0); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if insufficient mbufs. Does not validate space * on the receiving sockbuf. */ int sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; SOCKBUF_LOCK_ASSERT(sb); ctrl_last = (control == NULL) ? NULL : m_last(control); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { int retval; SOCKBUF_LOCK(sb); retval = sbappendaddr_locked(sb, asa, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } int sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *n, *mlast; int space; SOCKBUF_LOCK_ASSERT(sb); if (control == NULL) panic("sbappendcontrol_locked"); space = m_length(control, &n) + m_length(m0, NULL); if (space > sbspace(sb)) return (0); m_clrprotoflags(m0); n->m_next = m0; /* concatenate data to control */ SBLASTRECORDCHK(sb); for (m = control; m->m_next; m = m->m_next) sballoc(sb, m); sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) { int retval; SOCKBUF_LOCK(sb); retval = sbappendcontrol_locked(sb, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } /* * Append the data in mbuf chain (m) into the socket buffer sb following mbuf * (n). If (n) is NULL, the buffer is presumed empty. * * When the data is compressed, mbufs in the chain may be handled in one of * three ways: * * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no * record boundary, and no change in data type). * * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into * an mbuf already in the socket buffer. This can occur if an * appropriate mbuf exists, there is room, both mbufs are not marked as * not ready, and no merging of data types will occur. * * (3) The mbuf may be appended to the end of the existing mbuf chain. * * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as * end-of-record. */ void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) { int eor = 0; struct mbuf *o; SOCKBUF_LOCK_ASSERT(sb); while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) { if (sb->sb_lastrecord == m) sb->sb_lastrecord = m->m_next; m = m_free(m); continue; } if (n && (n->m_flags & M_EOR) == 0 && M_WRITABLE(n) && ((sb->sb_flags & SB_NOCOALESCE) == 0) && !(m->m_flags & M_NOTREADY) && !(n->m_flags & M_NOTREADY) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, (unsigned)m->m_len); n->m_len += m->m_len; sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) sb->sb_acc += m->m_len; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) /* XXX: Probably don't need.*/ sb->sb_ctl += m->m_len; m = m_free(m); continue; } if (n) n->m_next = m; else sb->sb_mb = m; sb->sb_mbtail = m; sballoc(sb, m); n = m; m->m_flags &= ~M_EOR; m = m->m_next; n->m_next = 0; } if (eor) { KASSERT(n != NULL, ("sbcompress: eor && n == NULL")); n->m_flags |= eor; } SBLASTMBUFCHK(sb); } /* * Free all mbufs in a sockbuf. Check that all resources are reclaimed. */ static void sbflush_internal(struct sockbuf *sb) { while (sb->sb_mbcnt) { /* * Don't call sbcut(sb, 0) if the leading mbuf is non-empty: * we would loop forever. Panic instead. */ if (sb->sb_ccc == 0 && (sb->sb_mb == NULL || sb->sb_mb->m_len)) break; m_freem(sbcut_internal(sb, (int)sb->sb_ccc)); } KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0, ("%s: ccc %u mb %p mbcnt %u", __func__, sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt)); } void sbflush_locked(struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sbflush_internal(sb); } void sbflush(struct sockbuf *sb) { SOCKBUF_LOCK(sb); sbflush_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Cut data from (the front of) a sockbuf. */ static struct mbuf * sbcut_internal(struct sockbuf *sb, int len) { struct mbuf *m, *next, *mfree; KASSERT(len >= 0, ("%s: len is %d but it is supposed to be >= 0", __func__, len)); KASSERT(len <= sb->sb_ccc, ("%s: len: %d is > ccc: %u", __func__, len, sb->sb_ccc)); next = (m = sb->sb_mb) ? m->m_nextpkt : 0; mfree = NULL; while (len > 0) { if (m == NULL) { KASSERT(next, ("%s: no next, len %d", __func__, len)); m = next; next = m->m_nextpkt; } if (m->m_len > len) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p M_NOTAVAIL", __func__, m)); m->m_len -= len; m->m_data += len; sb->sb_ccc -= len; sb->sb_acc -= len; if (sb->sb_sndptroff != 0) sb->sb_sndptroff -= len; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl -= len; break; } len -= m->m_len; sbfree(sb, m); /* * Do not put M_NOTREADY buffers to the free list, they * are referenced from outside. */ if (m->m_flags & M_NOTREADY) m = m->m_next; else { struct mbuf *n; n = m->m_next; m->m_next = mfree; mfree = m; m = n; } } /* * Free any zero-length mbufs from the buffer. * For SOCK_DGRAM sockets such mbufs represent empty records. * XXX: For SOCK_STREAM sockets such mbufs can appear in the buffer, * when sosend_generic() needs to send only control data. */ while (m && m->m_len == 0) { struct mbuf *n; sbfree(sb, m); n = m->m_next; m->m_next = mfree; mfree = m; m = n; } if (m) { sb->sb_mb = m; m->m_nextpkt = next; } else sb->sb_mb = next; /* * First part is an inline SB_EMPTY_FIXUP(). Second part makes sure * sb_lastrecord is up-to-date if we dropped part of the last record. */ m = sb->sb_mb; if (m == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (m->m_nextpkt == NULL) { sb->sb_lastrecord = m; } return (mfree); } /* * Drop data from (the front of) a sockbuf. */ void sbdrop_locked(struct sockbuf *sb, int len) { SOCKBUF_LOCK_ASSERT(sb); m_freem(sbcut_internal(sb, len)); } /* * Drop data from (the front of) a sockbuf, * and return it to caller. */ struct mbuf * sbcut_locked(struct sockbuf *sb, int len) { SOCKBUF_LOCK_ASSERT(sb); return (sbcut_internal(sb, len)); } void sbdrop(struct sockbuf *sb, int len) { struct mbuf *mfree; SOCKBUF_LOCK(sb); mfree = sbcut_internal(sb, len); SOCKBUF_UNLOCK(sb); m_freem(mfree); } /* * Maintain a pointer and offset pair into the socket buffer mbuf chain to * avoid traversal of the entire socket buffer for larger offsets. */ struct mbuf * sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff) { struct mbuf *m, *ret; KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); KASSERT(off + len <= sb->sb_acc, ("%s: beyond sb", __func__)); KASSERT(sb->sb_sndptroff <= sb->sb_acc, ("%s: sndptroff broken", __func__)); /* * Is off below stored offset? Happens on retransmits. * Just return, we can't help here. */ if (sb->sb_sndptroff > off) { *moff = off; return (sb->sb_mb); } /* Return closest mbuf in chain for current offset. */ *moff = off - sb->sb_sndptroff; m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb; if (*moff == m->m_len) { *moff = 0; sb->sb_sndptroff += m->m_len; m = ret = m->m_next; KASSERT(ret->m_len > 0, ("mbuf %p in sockbuf %p chain has no valid data", ret, sb)); } /* Advance by len to be as close as possible for the next transmit. */ for (off = off - sb->sb_sndptroff + len - 1; off > 0 && m != NULL && off >= m->m_len; m = m->m_next) { sb->sb_sndptroff += m->m_len; off -= m->m_len; } if (off > 0 && m == NULL) panic("%s: sockbuf %p and mbuf %p clashing", __func__, sb, ret); sb->sb_sndptr = m; return (ret); } /* * Return the first mbuf and the mbuf data offset for the provided * send offset without changing the "sb_sndptroff" field. */ struct mbuf * sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff) { struct mbuf *m; KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); /* * If the "off" is below the stored offset, which happens on * retransmits, just use "sb_mb": */ if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) { m = sb->sb_mb; } else { m = sb->sb_sndptr; off -= sb->sb_sndptroff; } while (off > 0 && m != NULL) { if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } *moff = off; return (m); } /* * Drop a record off the front of a sockbuf and move the next record to the * front. */ void sbdroprecord_locked(struct sockbuf *sb) { struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; do { sbfree(sb, m); m = m_free(m); } while (m); } SB_EMPTY_FIXUP(sb); } /* * Drop a record off the front of a sockbuf and move the next record to the * front. */ void sbdroprecord(struct sockbuf *sb) { SOCKBUF_LOCK(sb); sbdroprecord_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Create a "control" mbuf containing the specified data with the specified * type for presentation on a socket buffer. */ struct mbuf * sbcreatecontrol(caddr_t p, int size, int type, int level) { struct cmsghdr *cp; struct mbuf *m; if (CMSG_SPACE((u_int)size) > MCLBYTES) return ((struct mbuf *) NULL); if (CMSG_SPACE((u_int)size) > MLEN) m = m_getcl(M_NOWAIT, MT_CONTROL, 0); else m = m_get(M_NOWAIT, MT_CONTROL); if (m == NULL) return ((struct mbuf *) NULL); cp = mtod(m, struct cmsghdr *); m->m_len = 0; KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), ("sbcreatecontrol: short mbuf")); /* * Don't leave the padding between the msg header and the * cmsg data and the padding after the cmsg data un-initialized. */ bzero(cp, CMSG_SPACE((u_int)size)); if (p != NULL) (void)memcpy(CMSG_DATA(cp), p, size); m->m_len = CMSG_SPACE(size); cp->cmsg_len = CMSG_LEN(size); cp->cmsg_level = level; cp->cmsg_type = type; return (m); } /* * This does the same for socket buffers that sotoxsocket does for sockets: * generate an user-format data structure describing the socket buffer. Note * that the xsockbuf structure, since it is always embedded in a socket, does * not include a self pointer nor a length. We make this entry point public * in case some other mechanism needs it. */ void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) { xsb->sb_cc = sb->sb_ccc; xsb->sb_hiwat = sb->sb_hiwat; xsb->sb_mbcnt = sb->sb_mbcnt; xsb->sb_mcnt = sb->sb_mcnt; xsb->sb_ccnt = sb->sb_ccnt; xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; xsb->sb_timeo = sb->sb_timeo; } /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ static int dummy; SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW, &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size"); SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, &sb_efficiency, 0, "Socket buffer size waste factor"); Index: head/sys/kern/uipc_socket.c =================================================================== --- head/sys/kern/uipc_socket.c (revision 319721) +++ head/sys/kern/uipc_socket.c (revision 319722) @@ -1,3777 +1,4124 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" +#include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); +static void so_rdknl_lock(void *); +static void so_rdknl_unlock(void *); +static void so_rdknl_assert_locked(void *); +static void so_rdknl_assert_unlocked(void *); +static void so_wrknl_lock(void *); +static void so_wrknl_unlock(void *); +static void so_wrknl_assert_locked(void *); +static void so_wrknl_assert_unlocked(void *); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); -static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); static int filt_soempty(struct knote *kn, long hint); +static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. */ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } + /* + * The socket locking protocol allows to lock 2 sockets at a time, + * however, the first one must be a listening socket. WITNESS lacks + * a feature to change class of an existing lock, so we use DUPOK. + */ + mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); + so->so_rcv.sb_sel = &so->so_rdsel; + so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ static void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); - /* remove accept filter if one is present. */ - if (so->so_accf != NULL) - accept_filt_setopt(so, NULL); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); crfree(so->so_cred); khelp_destroy_osd(&so->osd); - sx_destroy(&so->so_snd.sb_sx); - sx_destroy(&so->so_rcv.sb_sx); - SOCKBUF_LOCK_DESTROY(&so->so_snd); - SOCKBUF_LOCK_DESTROY(&so->so_rcv); + if (SOLISTENING(so)) { + if (so->sol_accept_filter != NULL) + accept_filt_setopt(so, NULL); + } else { + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + } + mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. */ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } if (prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); - TAILQ_INIT(&so->so_incomp); - TAILQ_INIT(&so->so_comp); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); - so->so_count = 1; + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { - KASSERT(so->so_count == 1, ("socreate: so_count %d", - so->so_count)); - so->so_count = 0; sodealloc(so); return (error); } + soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject * to space constraints, etc.) then we allocate a new structure, properly * linked into the data structure of the original socket, and return this. * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. * * Note: the ref count on the socket is 0 on return. */ struct socket * sonewconn(struct socket *head, int connstatus) { static struct timeval lastover; static struct timeval overinterval = { 60, 0 }; static int overcount; struct socket *so; - int over; + u_int over; - ACCEPT_LOCK(); - over = (head->so_qlen > 3 * head->so_qlimit / 2); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(head); + over = (head->sol_qlen > 3 * head->sol_qlimit / 2); + SOLISTEN_UNLOCK(head); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif overcount++; if (ratecheck(&lastover, &overinterval)) { log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", - __func__, head->so_pcb, head->so_qlen, overcount); + __func__, head->so_pcb, head->sol_qlen, overcount); overcount = 0; } return (NULL); } - VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", - __func__, __LINE__, head)); + VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", + __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } - if ((head->so_options & SO_ACCEPTFILTER) != 0) - connstatus = 0; - so->so_head = head; + so->so_listen = head; so->so_type = head->so_type; - so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); VNET_SO_ASSERT(head); - if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } - so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; - so->so_snd.sb_lowat = head->so_snd.sb_lowat; - so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; - so->so_snd.sb_timeo = head->so_snd.sb_timeo; - so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; - so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; + so->so_snd.sb_lowat = head->sol_sbsnd_lowat; + so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; + so->so_snd.sb_timeo = head->sol_sbsnd_timeo; + so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; + + SOLISTEN_LOCK(head); + if (head->sol_accept_filter != NULL) + connstatus = 0; so->so_state |= connstatus; - ACCEPT_LOCK(); - /* - * The accept socket may be tearing down but we just - * won a race on the ACCEPT_LOCK. - * However, if sctp_peeloff() is called on a 1-to-many - * style socket, the SO_ACCEPTCONN doesn't need to be set. - */ - if (!(head->so_options & SO_ACCEPTCONN) && - ((head->so_proto->pr_protocol != IPPROTO_SCTP) || - (head->so_type != SOCK_SEQPACKET))) { - SOCK_LOCK(so); - so->so_head = NULL; - sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */ - return (NULL); - } + so->so_options = head->so_options & ~SO_ACCEPTCONN; + soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - so->so_qstate |= SQ_COMP; - head->so_qlen++; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + so->so_qstate = SQ_COMP; + head->sol_qlen++; + solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ - while (head->so_incqlen > head->so_qlimit) { + while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; - sp = TAILQ_FIRST(&head->so_incomp); - TAILQ_REMOVE(&head->so_incomp, sp, so_list); - head->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); + + sp = TAILQ_FIRST(&head->sol_incomp); + TAILQ_REMOVE(&head->sol_incomp, sp, so_list); + head->sol_incqlen--; + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); - ACCEPT_LOCK(); + SOLISTEN_LOCK(head); } - TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); - so->so_qstate |= SQ_INCOMP; - head->so_incqlen++; + TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); + so->so_qstate = SQ_INCOMP; + head->sol_incqlen++; + SOLISTEN_UNLOCK(head); } - ACCEPT_UNLOCK(); - if (connstatus) { - sorwakeup(head); - wakeup_one(&head->so_timeo); + return (so); +} + +#ifdef SCTP +/* + * Socket part of sctp_peeloff(). Detach a new socket from an + * association. The new socket is returned with a reference. + */ +struct socket * +sopeeloff(struct socket *head) +{ + struct socket *so; + + VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", + __func__, __LINE__, head)); + so = soalloc(head->so_vnet); + if (so == NULL) { + log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " + "limit reached or out of memory\n", + __func__, head->so_pcb); + return (NULL); } + so->so_type = head->so_type; + so->so_options = head->so_options; + so->so_linger = head->so_linger; + so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; + so->so_fibnum = head->so_fibnum; + so->so_proto = head->so_proto; + so->so_cred = crhold(head->so_cred); +#ifdef MAC + mac_socket_newconn(head, so); +#endif + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); + VNET_SO_ASSERT(head); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", + __func__, head->so_pcb); + return (NULL); + } + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", + __func__, head->so_pcb); + return (NULL); + } + so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; + so->so_snd.sb_lowat = head->so_snd.sb_lowat; + so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; + so->so_snd.sb_timeo = head->so_snd.sb_timeo; + so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + + soref(so); + return (so); } +#endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); CURVNET_RESTORE(); return (error); } int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) return (EINVAL); return (0); } void solisten_proto(struct socket *so, int backlog) { + int sbrcv_lowat, sbsnd_lowat; + u_int sbrcv_hiwat, sbsnd_hiwat; + short sbrcv_flags, sbsnd_flags; + sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); + if (SOLISTENING(so)) + goto listening; + + /* + * Change this socket to listening state. + */ + sbrcv_lowat = so->so_rcv.sb_lowat; + sbsnd_lowat = so->so_snd.sb_lowat; + sbrcv_hiwat = so->so_rcv.sb_hiwat; + sbsnd_hiwat = so->so_snd.sb_hiwat; + sbrcv_flags = so->so_rcv.sb_flags; + sbsnd_flags = so->so_snd.sb_flags; + sbrcv_timeo = so->so_rcv.sb_timeo; + sbsnd_timeo = so->so_snd.sb_timeo; + + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + +#ifdef INVARIANTS + bzero(&so->so_rcv, + sizeof(struct socket) - offsetof(struct socket, so_rcv)); +#endif + + so->sol_sbrcv_lowat = sbrcv_lowat; + so->sol_sbsnd_lowat = sbsnd_lowat; + so->sol_sbrcv_hiwat = sbrcv_hiwat; + so->sol_sbsnd_hiwat = sbsnd_hiwat; + so->sol_sbrcv_flags = sbrcv_flags; + so->sol_sbsnd_flags = sbsnd_flags; + so->sol_sbrcv_timeo = sbrcv_timeo; + so->sol_sbsnd_timeo = sbsnd_timeo; + + so->sol_qlen = so->sol_incqlen = 0; + TAILQ_INIT(&so->sol_incomp); + TAILQ_INIT(&so->sol_comp); + + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; + + so->so_options |= SO_ACCEPTCONN; + +listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; - so->so_qlimit = backlog; - so->so_options |= SO_ACCEPTCONN; + so->sol_qlimit = backlog; } /* + * Wakeup listeners/subsystems once we have a complete connection. + * Enters with lock, returns unlocked. + */ +void +solisten_wakeup(struct socket *sol) +{ + + if (sol->sol_upcall != NULL) + (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); + else { + selwakeuppri(&sol->so_rdsel, PSOCK); + KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); + } + SOLISTEN_UNLOCK(sol); + wakeup_one(&sol->sol_comp); +} + +/* + * Return single connection off a listening socket queue. Main consumer of + * the function is kern_accept4(). Some modules, that do their own accept + * management also use the function. + * + * Listening socket must be locked on entry and is returned unlocked on + * return. + * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. + */ +int +solisten_dequeue(struct socket *head, struct socket **ret, int flags) +{ + struct socket *so; + int error; + + SOLISTEN_LOCK_ASSERT(head); + + while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && + head->so_error == 0) { + error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH, + "accept", 0); + if (error != 0) { + SOLISTEN_UNLOCK(head); + return (error); + } + } + if (head->so_error) { + error = head->so_error; + head->so_error = 0; + SOLISTEN_UNLOCK(head); + return (error); + } + if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) { + SOLISTEN_UNLOCK(head); + return (EWOULDBLOCK); + } + so = TAILQ_FIRST(&head->sol_comp); + SOCK_LOCK(so); + KASSERT(so->so_qstate == SQ_COMP, + ("%s: so %p not SQ_COMP", __func__, so)); + soref(so); + head->sol_qlen--; + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + TAILQ_REMOVE(&head->sol_comp, so, so_list); + if (flags & ACCEPT4_INHERIT) + so->so_state |= (head->so_state & SS_NBIO); + else + so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; + SOCK_UNLOCK(so); + sorele(head); + + *ret = so; + return (0); +} + +/* * Evaluate the reference count and named references on a socket; if no * references remain, free it. This should be called whenever a reference is * released, such as in sorele(), but also when named reference flags are * cleared in socket or protocol code. * * sofree() will free the socket if: * * - There are no outstanding file descriptor references or related consumers * (so_count == 0). * * - The socket has been closed by user space, if ever open (SS_NOFDREF). * * - The protocol does not have an outstanding strong reference on the socket * (SS_PROTOREF). * * - The socket is not in a completed connection queue, so a process has been * notified that it is present. If it is removed, the user process may * block in accept() despite select() saying the socket was ready. */ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; - struct socket *head; - ACCEPT_LOCK_ASSERT(); SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || - (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { + (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) { SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); return; } - head = so->so_head; - if (head != NULL) { - KASSERT((so->so_qstate & SQ_COMP) != 0 || - (so->so_qstate & SQ_INCOMP) != 0, - ("sofree: so_head != NULL, but neither SQ_COMP nor " - "SQ_INCOMP")); - KASSERT((so->so_qstate & SQ_COMP) == 0 || - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - so->so_head = NULL; + if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { + struct socket *sol; + + sol = so->so_listen; + KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); + + /* + * To solve race between close of a listening socket and + * a socket on its incomplete queue, we need to lock both. + * The order is first listening socket, then regular. + * Since we don't have SS_NOFDREF neither SS_PROTOREF, this + * function and the listening socket are the only pointers + * to so. To preserve so and sol, we reference both and then + * relock. + * After relock the socket may not move to so_comp since it + * doesn't have PCB already, but it may be removed from + * so_incomp. If that happens, we share responsiblity on + * freeing the socket, but soclose() has already removed + * it from queue. + */ + soref(sol); + soref(so); + SOCK_UNLOCK(so); + SOLISTEN_LOCK(sol); + SOCK_LOCK(so); + if (so->so_qstate == SQ_INCOMP) { + KASSERT(so->so_listen == sol, + ("%s: so %p migrated out of sol %p", + __func__, so, sol)); + TAILQ_REMOVE(&sol->sol_incomp, so, so_list); + sol->sol_incqlen--; + /* This is guarenteed not to be the last. */ + refcount_release(&sol->so_count); + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + } else + KASSERT(so->so_listen == NULL, + ("%s: so %p not on (in)comp with so_listen", + __func__, so)); + sorele(sol); + KASSERT(so->so_count == 1, + ("%s: so %p count %u", __func__, so, so->so_count)); + so->so_count = 0; } - KASSERT((so->so_qstate & SQ_COMP) == 0 && - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", - so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); - if (so->so_options & SO_ACCEPTCONN) { - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("sofree: so_comp populated")); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("sofree: so_incomp populated")); - } + if (SOLISTENING(so)) + so->so_error = ECONNABORTED; SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(so); if (pr->pr_usrreqs->pru_detach != NULL) (*pr->pr_usrreqs->pru_detach)(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to * dom_dispose() and sbrelease_internal() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ - sbdestroy(&so->so_snd, so); - sbdestroy(&so->so_rcv, so); - seldrain(&so->so_snd.sb_sel); - seldrain(&so->so_rcv.sb_sel); - knlist_destroy(&so->so_rcv.sb_sel.si_note); - knlist_destroy(&so->so_snd.sb_sel.si_note); + if (!SOLISTENING(so)) { + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + } + seldrain(&so->so_rdsel); + seldrain(&so->so_wrsel); + knlist_destroy(&so->so_rdsel.si_note); + knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { + struct accept_queue lqueue; + bool listening; int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if (so->so_options & SO_LINGER) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); - ACCEPT_LOCK(); - if (so->so_options & SO_ACCEPTCONN) { + + SOCK_LOCK(so); + if ((listening = (so->so_options & SO_ACCEPTCONN))) { struct socket *sp; - /* - * Prevent new additions to the accept queues due - * to ACCEPT_LOCK races while we are draining them. - */ - so->so_options &= ~SO_ACCEPTCONN; - while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { - TAILQ_REMOVE(&so->so_incomp, sp, so_list); - so->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); + + TAILQ_INIT(&lqueue); + TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); + TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); + + so->sol_qlen = so->sol_incqlen = 0; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + /* Guaranteed not to be the last. */ + refcount_release(&so->so_count); } - while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { - TAILQ_REMOVE(&so->so_comp, sp, so_list); - so->so_qlen--; - sp->so_qstate &= ~SQ_COMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); - } - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("%s: so_comp populated", __func__)); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("%s: so_incomp populated", __func__)); } - SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; - sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ + sorele(so); + if (listening) { + struct socket *sp; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + if (sp->so_count == 0) { + SOCK_UNLOCK(sp); + soabort(sp); + } else + /* sp is now in sofree() */ + SOCK_UNLOCK(sp); + } + } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on, or races with other threads are risked. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. */ void soabort(struct socket *so) { /* * In as much as is possible, assert that no references to this * socket are held. This is not quite the same as asserting that the * current thread is responsible for arranging for no references, but * is as close as we can get for now. */ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); - KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); - KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); + KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); - ACCEPT_LOCK(); SOCK_LOCK(so); sofree(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; if (fd == AT_FDCWD) { error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); } else { error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td); CURVNET_RESTORE(); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, 0); } error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error) { if (m != NULL) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m == NULL) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } else goto dontblock; } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { orig_resid = 0; while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: sbunlock(&so->so_rcv); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled. */ int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (controlp != NULL) return (EINVAL); if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) goto out; SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); sbunlock(sb); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. * Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. */ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, controlp, flagsp)); CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr = so->so_proto; int error, soerror_enotconn; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); soerror_enotconn = 0; if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { /* * POSIX mandates us to return ENOTCONN when shutdown(2) is * invoked on a datagram sockets, however historically we would * actually tear socket down. This is known to be leveraged by * some applications to unblock process waiting in recvXXX(2) * by other process that it shares that socket with. Try to meet * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). */ if (so->so_type != SOCK_DGRAM) return (ENOTCONN); soerror_enotconn = 1; } CURVNET_SET(so->so_vnet); if (pr->pr_usrreqs->pru_flush != NULL) (*pr->pr_usrreqs->pru_flush)(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = (*pr->pr_usrreqs->pru_shutdown)(so); wakeup(&so->so_timeo); CURVNET_RESTORE(); return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); } wakeup(&so->so_timeo); CURVNET_RESTORE(); return (soerror_enotconn ? ENOTCONN : 0); } void sorflush(struct socket *so) { struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; struct socket aso; VNET_SO_ASSERT(so); /* * In order to avoid calling dom_dispose with the socket buffer mutex * held, and in order to generally avoid holding the lock for a long * time, we make a copy of the socket buffer and clear the original * (except locks, state). The new socket buffer copy won't have * initialized locks so we can only call routines that won't use or * assert those locks. * * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ socantrcvmore(so); (void) sblock(sb, SBL_WAIT | SBL_NOINTR); /* * Invalidate/clear most of the sockbuf structure, but leave selinfo * and mutex data unchanged. */ SOCKBUF_LOCK(sb); bzero(&aso, sizeof(aso)); aso.so_pcb = so->so_pcb; bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); sbunlock(sb); /* * Dispose of special rights and flush the copied socket. Don't call * any unsafe routines (that rely on locks being initialized) on aso. */ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(&aso); sbrelease_internal(&aso.so_rcv, so); } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. */ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) { error = (*so->so_proto->pr_ctloutput)(so, sopt); CURVNET_RESTORE(); return (error); } error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for any of these options, * so disallow them. */ if (optval < 1) { error = EINVAL; goto bad; } switch (sopt->sopt_name) { case SO_SNDBUF: case SO_RCVBUF: if (sbreserve(sopt->sopt_name == SO_SNDBUF ? &so->so_snd : &so->so_rcv, (u_long)optval, so, curthread) == 0) { error = ENOBUFS; goto bad; } (sopt->sopt_name == SO_SNDBUF ? &so->so_snd : &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE; break; /* * Make sure the low-water is never greater than the * high-water. */ case SO_SNDLOWAT: SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_lowat = (optval > so->so_snd.sb_hiwat) ? so->so_snd.sb_hiwat : optval; SOCKBUF_UNLOCK(&so->so_snd); break; case SO_RCVLOWAT: SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_lowat = (optval > so->so_rcv.sb_hiwat) ? so->so_rcv.sb_hiwat : optval; SOCKBUF_UNLOCK(&so->so_rcv); break; } break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = val; break; } break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); optval = so->so_error; so->so_error = 0; SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: - optval = so->so_qlimit; + optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: - optval = so->so_qlen; + optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: - optval = so->so_incqlen; + optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); - selwakeuppri(&so->so_rcv.sb_sel, PSOCK); + selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { - int revents = 0; + int revents; - SOCKBUF_LOCK(&so->so_snd); - SOCKBUF_LOCK(&so->so_rcv); - if (events & (POLLIN | POLLRDNORM)) - if (soreadabledata(so)) - revents |= events & (POLLIN | POLLRDNORM); - - if (events & (POLLOUT | POLLWRNORM)) - if (sowriteable(so)) - revents |= events & (POLLOUT | POLLWRNORM); - - if (events & (POLLPRI | POLLRDBAND)) - if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) - revents |= events & (POLLPRI | POLLRDBAND); - - if ((events & POLLINIGNEOF) == 0) { - if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { - revents |= events & (POLLIN | POLLRDNORM); - if (so->so_snd.sb_state & SBS_CANTSENDMORE) - revents |= POLLHUP; + SOCK_LOCK(so); + if (SOLISTENING(so)) { + if (!(events & (POLLIN | POLLRDNORM))) + revents = 0; + else if (!TAILQ_EMPTY(&so->sol_comp)) + revents = events & (POLLIN | POLLRDNORM); + else { + selrecord(td, &so->so_rdsel); + revents = 0; } - } - - if (revents == 0) { - if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { - selrecord(td, &so->so_rcv.sb_sel); - so->so_rcv.sb_flags |= SB_SEL; + } else { + revents = 0; + SOCKBUF_LOCK(&so->so_snd); + SOCKBUF_LOCK(&so->so_rcv); + if (events & (POLLIN | POLLRDNORM)) + if (soreadabledata(so)) + revents |= events & (POLLIN | POLLRDNORM); + if (events & (POLLOUT | POLLWRNORM)) + if (sowriteable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || + (so->so_rcv.sb_state & SBS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + if ((events & POLLINIGNEOF) == 0) { + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + revents |= events & (POLLIN | POLLRDNORM); + if (so->so_snd.sb_state & SBS_CANTSENDMORE) + revents |= POLLHUP; + } } - - if (events & (POLLOUT | POLLWRNORM)) { - selrecord(td, &so->so_snd.sb_sel); - so->so_snd.sb_flags |= SB_SEL; + if (revents == 0) { + if (events & + (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(td, &so->so_rdsel); + so->so_rcv.sb_flags |= SB_SEL; + } + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &so->so_wrsel); + so->so_snd.sb_flags |= SB_SEL; + } } + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_UNLOCK(&so->so_snd); } - - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; + struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; + knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } - SOCKBUF_LOCK(sb); - knlist_add(&sb->sb_sel.si_note, kn, 1); - sb->sb_flags |= SB_KNOTE; - SOCKBUF_UNLOCK(sb); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + knlist_add(knl, kn, 1); + } else { + SOCKBUF_LOCK(sb); + knlist_add(knl, kn, 1); + sb->sb_flags |= SB_KNOTE; + SOCKBUF_UNLOCK(sb); + } + SOCK_UNLOCK(so); return (0); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. */ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) { return EOPNOTSUPP; } int pru_attach_notsupp(struct socket *so, int proto, struct thread *td) { return EOPNOTSUPP; } int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_disconnect_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) { return EOPNOTSUPP; } int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { return EOPNOTSUPP; } int pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) { return (EOPNOTSUPP); } /* * This isn't really a ``null'' operation, but it's the default one and * doesn't do anything destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } int pru_shutdown_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { return EOPNOTSUPP; } int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { return EOPNOTSUPP; } int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td) { return EOPNOTSUPP; } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_rcv); - knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_rcv.sb_sel.si_note)) + so_rdknl_lock(so); + knlist_remove(&so->so_rdsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_rcv); + so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; - if (so->so_options & SO_ACCEPTCONN) { - kn->kn_data = so->so_qlen; - return (!TAILQ_EMPTY(&so->so_comp)); + if (SOLISTENING(so)) { + SOCK_LOCK_ASSERT(so); + kn->kn_data = so->sol_qlen; + return (!TAILQ_EMPTY(&so->sol_comp)); } + SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_snd); - knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_snd.sb_sel.si_note)) + so_wrknl_lock(so); + knlist_remove(&so->so_wrsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_snd); + so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (0); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (1); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { struct socket *head; int ret; + /* + * XXXGL: this is the only place where we acquire socket locks + * in reverse order: first child, then listening socket. To + * avoid possible LOR, use try semantics. + */ restart: - ACCEPT_LOCK(); SOCK_LOCK(so); + if ((head = so->so_listen) != NULL && + __predict_false(SOLISTEN_TRYLOCK(head) == 0)) { + SOCK_UNLOCK(so); + goto restart; + } so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - head = so->so_head; - if (head != NULL && (so->so_qstate & SQ_INCOMP)) { + if (head != NULL && (so->so_qstate == SQ_INCOMP)) { +again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { + TAILQ_REMOVE(&head->sol_incomp, so, so_list); + head->sol_incqlen--; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + head->sol_qlen++; + so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - head->so_qlen++; - so->so_qstate |= SQ_COMP; - ACCEPT_UNLOCK(); - sorwakeup(head); - wakeup_one(&head->so_timeo); + solisten_wakeup(head); /* unlocks */ } else { - ACCEPT_UNLOCK(); + SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, - head->so_accf->so_accept_filter->accf_callback, - head->so_accf->so_accept_filter_arg); + head->sol_accept_filter->accf_callback, + head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; - ret = head->so_accf->so_accept_filter->accf_callback(so, - head->so_accf->so_accept_filter_arg, M_NOWAIT); - if (ret == SU_ISCONNECTED) + ret = head->sol_accept_filter->accf_callback(so, + head->sol_accept_filter_arg, M_NOWAIT); + if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); + SOCKBUF_UNLOCK(&so->so_rcv); + goto again; + } + SOCKBUF_UNLOCK(&so->so_rcv); SOCK_UNLOCK(so); - if (ret == SU_ISCONNECTED) - goto restart; + SOLISTEN_UNLOCK(head); } return; } + if (head != NULL) + SOLISTEN_UNLOCK(head); SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; + KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); + switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; + KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); + switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); - KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); + KASSERT(sb->sb_upcall != NULL, + ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } +void +solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) +{ + + SOLISTEN_LOCK_ASSERT(so); + so->sol_upcall = func; + so->sol_upcallarg = arg; +} + +static void +so_rdknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_rcv); +} + +static void +so_rdknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); +} + +static void +so_rdknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +} + +static void +so_rdknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); +} + +static void +so_wrknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_snd); +} + +static void +so_wrknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_snd); +} + +static void +so_wrknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_snd); +} + +static void +so_wrknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_snd); +} + /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { xso->xso_len = sizeof *xso; xso->xso_so = so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; - xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; - xso->so_oobmark = so->so_oobmark; - sbtoxsockbuf(&so->so_snd, &xso->so_snd); - sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_uid = so->so_cred->cr_uid; -} - - -/* - * Socket accessor functions to provide external consumers with - * a safe interface to socket state - * - */ - -void -so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), - void *arg) -{ - - TAILQ_FOREACH(so, &so->so_comp, so_list) - func(so, arg); + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + if (SOLISTENING(so)) { + xso->so_qlen = so->sol_qlen; + xso->so_incqlen = so->sol_incqlen; + xso->so_qlimit = so->sol_qlimit; + xso->so_oobmark = 0; + bzero(&xso->so_snd, sizeof(xso->so_snd)); + bzero(&xso->so_rcv, sizeof(xso->so_rcv)); + } else { + xso->so_state |= so->so_qstate; + xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + } } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } Index: head/sys/kern/uipc_syscalls.c =================================================================== --- head/sys/kern/uipc_syscalls.c (revision 319721) +++ head/sys/kern/uipc_syscalls.c (revision 319722) @@ -1,1607 +1,1562 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include -/* - * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC - * and SOCK_NONBLOCK. - */ -#define ACCEPT4_INHERIT 0x1 -#define ACCEPT4_COMPAT 0x2 - static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); static int sockargs(struct mbuf **, char *, socklen_t, int); /* * Convert a user file descriptor to a kernel file entry and check if required * capability rights are present. * If required copy of current set of capability rights is returned. * A reference on the file entry is held upon returning. */ int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, u_int *fflagp, struct filecaps *havecapsp) { struct file *fp; int error; error = fget_cap(td, fd, rightsp, &fp, havecapsp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); if (havecapsp != NULL) filecaps_free(havecapsp); return (ENOTSOCK); } if (fflagp != NULL) *fflagp = fp->f_flag; *fpp = fp; return (0); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int sys_socket(struct thread *td, struct socket_args *uap) { return (kern_socket(td, uap->domain, uap->type, uap->protocol)); } int kern_socket(struct thread *td, int domain, int type, int protocol) { struct socket *so; struct file *fp; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(domain, &so, type, protocol, td->td_ucred, td); if (error != 0) { fdclose(td, fp, fd); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } int sys_bind(struct thread *td, struct bind_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND), &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } int sys_bindat(struct thread *td, struct bindat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int sys_listen(struct thread *td, struct listen_args *uap) { return (kern_listen(td, uap->s, uap->backlog)); } int kern_listen(struct thread *td, int s, int backlog) { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_LISTEN), &fp, NULL, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, backlog, td); fdrop(fp, td); } return (error); } /* * accept1() */ static int accept1(td, s, uname, anamelen, flags) struct thread *td; int s; struct sockaddr *uname; socklen_t *anamelen; int flags; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uname == NULL) return (kern_accept4(td, s, NULL, NULL, flags, NULL)); error = copyin(anamelen, &namelen, sizeof (namelen)); if (error != 0) return (error); error = kern_accept4(td, s, &name, &namelen, flags, &fp); if (error != 0) return (error); if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK if (flags & ACCEPT4_COMPAT) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uname, namelen); } if (error == 0) error = copyout(&namelen, anamelen, sizeof(namelen)); if (error != 0) fdclose(td, fp, td->td_retval[0]); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; struct socket *head, *so; struct filecaps fcaps; cap_rights_t rights; u_int fflag; pid_t pgid; int error, fd, tmp; if (name != NULL) *name = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT), &headfp, &fflag, &fcaps); if (error != 0) return (error); head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif error = falloc_caps(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; - ACCEPT_LOCK(); - if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { - ACCEPT_UNLOCK(); - error = EWOULDBLOCK; + SOCK_LOCK(head); + if (!SOLISTENING(head)) { + SOCK_UNLOCK(head); + error = EINVAL; goto noconnection; } - while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { - head->so_error = ECONNABORTED; - break; - } - error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, - "accept", 0); - if (error != 0) { - ACCEPT_UNLOCK(); - goto noconnection; - } - } - if (head->so_error) { - error = head->so_error; - head->so_error = 0; - ACCEPT_UNLOCK(); + + error = solisten_dequeue(head, &so, flags); + if (error != 0) goto noconnection; - } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - if (flags & ACCEPT4_INHERIT) - so->so_state |= (head->so_state & SS_NBIO); - else - so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); - /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; - /* connection has been removed from the listen queue */ - KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); + /* Connection has been removed from the listen queue. */ + KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); - sa = NULL; error = soaccept(so, &sa); if (error != 0) goto noconnection; if (sa == NULL) { if (name) *namelen = 0; goto done; } AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif *name = sa; sa = NULL; } noconnection: free(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (nfp == NULL) filecaps_free(&fcaps); if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(td, uap) struct thread *td; struct accept4_args *uap; { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ int sys_connect(struct thread *td, struct connect_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error, interrupted = 0; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT), &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif if (dirfd == AT_FDCWD) error = soconnect(so, sa, td); else error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, + error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } int sys_connectat(struct thread *td, struct connectat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error != 0) goto free4; } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(td, fp2, rsv[1]); fdrop(fp2, td); free3: fdclose(td, fp1, rsv[0]); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(struct thread *td, int s, struct msghdr *mp, int flags) { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: free(to, M_SONAME); return (error); } int kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); cap_rights_init(&rights, CAP_SEND); if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); cap_rights_set(&rights, CAP_CONNECT); } error = getsock_cap(td, s, &rights, &fp, NULL, NULL); if (error != 0) { m_freem(control); return (error); } so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) { m_freem(control); goto bad; } } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) { m_freem(control); goto bad; } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; m_freem(control); goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sys_sendto(struct thread *td, struct sendto_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(struct thread *td, struct osend_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(struct thread *td, struct osendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(struct thread *td, struct sendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, struct mbuf **controlp) { struct uio auio; struct iovec *iov; struct mbuf *m, *control = NULL; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV), &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? &control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp) { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int sys_recvfrom(struct thread *td, struct recvfrom_args *uap) { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return (error); } #ifdef COMPAT_OLDSOCK int orecvfrom(struct thread *td, struct recvfrom_args *uap) { uap->flags |= MSG_COMPAT; return (sys_recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(struct thread *td, struct orecv_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(struct thread *td, struct orecvmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int sys_recvmsg(struct thread *td, struct recvmsg_args *uap) { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } int sys_shutdown(struct thread *td, struct shutdown_args *uap) { return (kern_shutdown(td, uap->s, uap->how)); } int kern_shutdown(struct thread *td, int s, int how) { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, how); /* * Previous versions did not return ENOTCONN, but 0 in * case the socket was not connected. Some important * programs like syslogd up to r279016, 2015-02-19, * still depend on this behavior. */ if (error == ENOTCONN && td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) error = 0; fdrop(fp, td); } return (error); } int sys_setsockopt(struct thread *td, struct setsockopt_args *uap) { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t valsize) { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } int sys_getsockopt(struct thread *td, struct getsockopt_args *uap) { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or userspace. optlen is always a kernel pointer. */ int kern_getsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t *valsize) { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name. */ static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: fdrop(fp, td); if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int sys_getsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int sys_getpeername(struct thread *td, struct getpeername_args *uap) { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(struct thread *td, struct ogetpeername_args *uap) { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ static int sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EINVAL); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, void *), buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len) { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } Index: head/sys/kern/uipc_usrreq.c =================================================================== --- head/sys/kern/uipc_usrreq.c (revision 319721) +++ head/sys/kern/uipc_usrreq.c (revision 319722) @@ -1,2586 +1,2580 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. * Copyright (c) 2004-2009 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 */ /* * UNIX Domain (Local) Sockets * * This is an implementation of UNIX (local) domain sockets. Each socket has * an associated struct unpcb (UNIX protocol control block). Stream sockets * may be connected to 0 or 1 other socket. Datagram sockets may be * connected to 0, 1, or many other sockets. Sockets may be created and * connected in pairs (socketpair(2)), or bound/connected to using the file * system name space. For most purposes, only the receive socket buffer is * used, as sending on one socket delivers directly to the receive socket * buffer of a second socket. * * The implementation is substantially complicated by the fact that * "ancillary data", such as file descriptors or credentials, may be passed * across UNIX domain sockets. The potential for passing UNIX domain sockets * over other UNIX domain sockets requires the implementation of a simple * garbage collector to find and tear down cycles of disconnected sockets. * * TODO: * RDM * rethink name space problems * need a proper out-of-band */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include /* XXX must be before */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include MALLOC_DECLARE(M_FILECAPS); /* * Locking key: * (l) Locked using list lock * (g) Locked using linkage lock */ static uma_zone_t unp_zone; static unp_gen_t unp_gencnt; /* (l) */ static u_int unp_count; /* (l) Count of local sockets. */ static ino_t unp_ino; /* Prototype for fake inode numbers. */ static int unp_rights; /* (g) File descriptors in flight. */ static struct unp_head unp_shead; /* (l) List of stream sockets. */ static struct unp_head unp_dhead; /* (l) List of datagram sockets. */ static struct unp_head unp_sphead; /* (l) List of seqpacket sockets. */ struct unp_defer { SLIST_ENTRY(unp_defer) ud_link; struct file *ud_fp; }; static SLIST_HEAD(, unp_defer) unp_defers; static int unp_defers_count; static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; /* * Garbage collection of cyclic file descriptor/socket references occurs * asynchronously in a taskqueue context in order to avoid recursion and * reentrance in the UNIX domain socket, file descriptor, and socket layer * code. See unp_gc() for a full description. */ static struct timeout_task unp_gc_task; /* * The close of unix domain sockets attached as SCM_RIGHTS is * postponed to the taskqueue, to avoid arbitrary recursion depth. * The attached sockets might have another sockets attached. */ static struct task unp_defer_task; /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering for * stream sockets, although the total for sender and receiver is actually * only PIPSIZ. * * Datagram sockets really use the sendspace as the maximum datagram size, * and don't really want to reserve the sendspace. Their recvspace should be * large enough for at least one max-size datagram plus address. */ #ifndef PIPSIZ #define PIPSIZ 8192 #endif static u_long unpst_sendspace = PIPSIZ; static u_long unpst_recvspace = PIPSIZ; static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ static u_long unpdg_recvspace = 4*1024; static u_long unpsp_sendspace = PIPSIZ; /* really max datagram size */ static u_long unpsp_recvspace = PIPSIZ; static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); static SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket, CTLFLAG_RW, 0, "SOCK_SEQPACKET"); SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, &unpst_sendspace, 0, "Default stream send space."); SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, &unpst_recvspace, 0, "Default stream receive space."); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, &unpdg_sendspace, 0, "Default datagram send space."); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, &unpdg_recvspace, 0, "Default datagram receive space."); SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW, &unpsp_sendspace, 0, "Default seqpacket send space."); SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW, &unpsp_recvspace, 0, "Default seqpacket receive space."); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "File descriptors in flight."); SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD, &unp_defers_count, 0, "File descriptors deferred to taskqueue for close."); /* * Locking and synchronization: * - * Three types of locks exit in the local domain socket implementation: a - * global list mutex, a global linkage rwlock, and per-unpcb mutexes. Of the - * global locks, the list lock protects the socket count, global generation - * number, and stream/datagram global lists. The linkage lock protects the + * Two types of locks exist in the local domain socket implementation: a + * a global linkage rwlock and per-unpcb mutexes. The linkage lock protects + * the socket count, global generation number, stream/datagram global lists and * interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be * held exclusively over the acquisition of multiple unpcb locks to prevent * deadlock. * * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer, * allocated in pru_attach() and freed in pru_detach(). The validity of that * pointer is an invariant, so no lock is required to dereference the so_pcb * pointer if a valid socket reference is held by the caller. In practice, * this is always true during operations performed on a socket. Each unpcb * has a back-pointer to its socket, unp_socket, which will be stable under * the same circumstances. * * This pointer may only be safely dereferenced as long as a valid reference * to the unpcb is held. Typically, this reference will be from the socket, * or from another unpcb when the referring unpcb's lock is held (in order * that the reference not be invalidated during use). For example, to follow * unp->unp_conn->unp_socket, you need unlock the lock on unp, not unp_conn, * as unp_socket remains valid as long as the reference to unp_conn is valid. * * Fields of unpcbss are locked using a per-unpcb lock, unp_mtx. Individual * atomic reads without the lock may be performed "lockless", but more * complex reads and read-modify-writes require the mutex to be held. No * lock order is defined between unpcb locks -- multiple unpcb locks may be * acquired at the same time only when holding the linkage rwlock * exclusively, which prevents deadlocks. * * Blocking with UNIX domain sockets is a tricky issue: unlike most network * protocols, bind() is a non-atomic operation, and connect() requires * potential sleeping in the protocol, due to potentially waiting on local or * distributed file systems. We try to separate "lookup" operations, which * may sleep, and the IPC operations themselves, which typically can occur * with relative atomicity as locks can be held over the entire operation. * * Another tricky issue is simultaneous multi-threaded or multi-process * access to a single UNIX domain socket. These are handled by the flags * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or * binding, both of which involve dropping UNIX domain socket locks in order * to perform namei() and other file system operations. */ static struct rwlock unp_link_rwlock; -static struct mtx unp_list_lock; static struct mtx unp_defers_lock; #define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \ "unp_link_rwlock") #define UNP_LINK_LOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_LOCKED) #define UNP_LINK_UNLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_UNLOCKED) #define UNP_LINK_RLOCK() rw_rlock(&unp_link_rwlock) #define UNP_LINK_RUNLOCK() rw_runlock(&unp_link_rwlock) #define UNP_LINK_WLOCK() rw_wlock(&unp_link_rwlock) #define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock) #define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_WLOCKED) +#define UNP_LINK_WOWNED() rw_wowned(&unp_link_rwlock) -#define UNP_LIST_LOCK_INIT() mtx_init(&unp_list_lock, \ - "unp_list_lock", NULL, MTX_DEF) -#define UNP_LIST_LOCK() mtx_lock(&unp_list_lock) -#define UNP_LIST_UNLOCK() mtx_unlock(&unp_list_lock) - #define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \ "unp_defer", NULL, MTX_DEF) #define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock) #define UNP_DEFERRED_UNLOCK() mtx_unlock(&unp_defers_lock) #define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \ "unp_mtx", "unp_mtx", \ MTX_DUPOK|MTX_DEF|MTX_RECURSE) #define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx) #define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx) #define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx) #define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED) static int uipc_connect2(struct socket *, struct socket *); static int uipc_ctloutput(struct socket *, struct sockopt *); static int unp_connect(struct socket *, struct sockaddr *, struct thread *); static int unp_connectat(int, struct socket *, struct sockaddr *, struct thread *); static int unp_connect2(struct socket *so, struct socket *so2, int); static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2); static void unp_dispose(struct socket *so); static void unp_dispose_mbuf(struct mbuf *); static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *); static void unp_gc(__unused void *, int); static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int)); static void unp_discard(struct file *); static void unp_freerights(struct filedescent **, int); static void unp_init(void); static int unp_internalize(struct mbuf **, struct thread *); static void unp_internalize_fp(struct file *); static int unp_externalize(struct mbuf *, struct mbuf **, int); static int unp_externalize_fp(struct file *); static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *); static void unp_process_defers(void * __unused, int); /* * Definitions of protocols supported in the LOCAL domain. */ static struct domain localdomain; static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream; static struct pr_usrreqs uipc_usrreqs_seqpacket; static struct protosw localsw[] = { { .pr_type = SOCK_STREAM, .pr_domain = &localdomain, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, .pr_ctloutput = &uipc_ctloutput, .pr_usrreqs = &uipc_usrreqs_stream }, { .pr_type = SOCK_DGRAM, .pr_domain = &localdomain, .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS, .pr_ctloutput = &uipc_ctloutput, .pr_usrreqs = &uipc_usrreqs_dgram }, { .pr_type = SOCK_SEQPACKET, .pr_domain = &localdomain, /* * XXXRW: For now, PR_ADDR because soreceive will bump into them * due to our use of sbappendaddr. A new sbappend variants is needed * that supports both atomic record writes and control data. */ .pr_flags = PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD| PR_RIGHTS, .pr_ctloutput = &uipc_ctloutput, .pr_usrreqs = &uipc_usrreqs_seqpacket, }, }; static struct domain localdomain = { .dom_family = AF_LOCAL, .dom_name = "local", .dom_init = unp_init, .dom_externalize = unp_externalize, .dom_dispose = unp_dispose, .dom_protosw = localsw, .dom_protoswNPROTOSW = &localsw[nitems(localsw)] }; DOMAIN_SET(local); static void uipc_abort(struct socket *so) { struct unpcb *unp, *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_abort: unp == NULL")); UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); unp_drop(unp2); UNP_PCB_UNLOCK(unp2); } UNP_PCB_UNLOCK(unp); UNP_LINK_WUNLOCK(); } static int uipc_accept(struct socket *so, struct sockaddr **nam) { struct unpcb *unp, *unp2; const struct sockaddr *sa; /* * Pass back name of connected socket, if it was bound and we are * still connected (our peer may have closed already!). */ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_accept: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_LINK_RLOCK(); unp2 = unp->unp_conn; if (unp2 != NULL && unp2->unp_addr != NULL) { UNP_PCB_LOCK(unp2); sa = (struct sockaddr *) unp2->unp_addr; bcopy(sa, *nam, sa->sa_len); UNP_PCB_UNLOCK(unp2); } else { sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); } UNP_LINK_RUNLOCK(); return (0); } static int uipc_attach(struct socket *so, int proto, struct thread *td) { u_long sendspace, recvspace; struct unpcb *unp; int error; + bool locked; KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { case SOCK_STREAM: sendspace = unpst_sendspace; recvspace = unpst_recvspace; break; case SOCK_DGRAM: sendspace = unpdg_sendspace; recvspace = unpdg_recvspace; break; case SOCK_SEQPACKET: sendspace = unpsp_sendspace; recvspace = unpsp_recvspace; break; default: panic("uipc_attach"); } error = soreserve(so, sendspace, recvspace); if (error) return (error); } unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO); if (unp == NULL) return (ENOBUFS); LIST_INIT(&unp->unp_refs); UNP_PCB_LOCK_INIT(unp); unp->unp_socket = so; so->so_pcb = unp; unp->unp_refcount = 1; - if (so->so_head != NULL) + if (so->so_listen != NULL) unp->unp_flags |= UNP_NASCENT; - UNP_LIST_LOCK(); + if ((locked = UNP_LINK_WOWNED()) == false) + UNP_LINK_WLOCK(); + unp->unp_gencnt = ++unp_gencnt; unp_count++; switch (so->so_type) { case SOCK_STREAM: LIST_INSERT_HEAD(&unp_shead, unp, unp_link); break; case SOCK_DGRAM: LIST_INSERT_HEAD(&unp_dhead, unp, unp_link); break; case SOCK_SEQPACKET: LIST_INSERT_HEAD(&unp_sphead, unp, unp_link); break; default: panic("uipc_attach"); } - UNP_LIST_UNLOCK(); + if (locked == false) + UNP_LINK_WUNLOCK(); + return (0); } static int uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vattr vattr; int error, namelen; struct nameidata nd; struct unpcb *unp; struct vnode *vp; struct mount *mp; cap_rights_t rights; char *buf; if (nam->sa_family != AF_UNIX) return (EAFNOSUPPORT); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); if (soun->sun_len > sizeof(struct sockaddr_un)) return (EINVAL); namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); if (namelen <= 0) return (EINVAL); /* * We don't allow simultaneous bind() calls on a single UNIX domain * socket, so flag in-progress operations, and return an error if an * operation is already in progress. * * Historically, we have not allowed a socket to be rebound, so this * also returns an error. Not allowing re-binding simplifies the * implementation and avoids a great many possible failure modes. */ UNP_PCB_LOCK(unp); if (unp->unp_vnode != NULL) { UNP_PCB_UNLOCK(unp); return (EINVAL); } if (unp->unp_flags & UNP_BINDING) { UNP_PCB_UNLOCK(unp); return (EALREADY); } unp->unp_flags |= UNP_BINDING; UNP_PCB_UNLOCK(unp); buf = malloc(namelen + 1, M_TEMP, M_WAITOK); bcopy(soun->sun_path, buf, namelen); buf[namelen] = 0; restart: NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE, UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_BINDAT), td); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); if (error) goto error; vp = nd.ni_vp; if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (vp != NULL) { vrele(vp); error = EADDRINUSE; goto error; } error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error) goto error; goto restart; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (error) { vn_finished_write(mp); goto error; } vp = nd.ni_vp; ASSERT_VOP_ELOCKED(vp, "uipc_bind"); soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); VOP_UNP_BIND(vp, unp); unp->unp_vnode = vp; unp->unp_addr = soun; unp->unp_flags &= ~UNP_BINDING; UNP_PCB_UNLOCK(unp); UNP_LINK_WUNLOCK(); VOP_UNLOCK(vp, 0); vn_finished_write(mp); free(buf, M_TEMP); return (0); error: UNP_PCB_LOCK(unp); unp->unp_flags &= ~UNP_BINDING; UNP_PCB_UNLOCK(unp); free(buf, M_TEMP); return (error); } static int uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (uipc_bindat(AT_FDCWD, so, nam, td)); } static int uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; KASSERT(td == curthread, ("uipc_connect: td != curthread")); UNP_LINK_WLOCK(); error = unp_connect(so, nam, td); UNP_LINK_WUNLOCK(); return (error); } static int uipc_connectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; KASSERT(td == curthread, ("uipc_connectat: td != curthread")); UNP_LINK_WLOCK(); error = unp_connectat(fd, so, nam, td); UNP_LINK_WUNLOCK(); return (error); } static void uipc_close(struct socket *so) { struct unpcb *unp, *unp2; + struct vnode *vp = NULL; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_close: unp == NULL")); UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } + if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) { + VOP_UNP_DETACH(vp); + unp->unp_vnode = NULL; + } UNP_PCB_UNLOCK(unp); UNP_LINK_WUNLOCK(); + if (vp) + vrele(vp); } static int uipc_connect2(struct socket *so1, struct socket *so2) { struct unpcb *unp, *unp2; int error; UNP_LINK_WLOCK(); unp = so1->so_pcb; KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); UNP_PCB_LOCK(unp); unp2 = so2->so_pcb; KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL")); UNP_PCB_LOCK(unp2); error = unp_connect2(so1, so2, PRU_CONNECT2); UNP_PCB_UNLOCK(unp2); UNP_PCB_UNLOCK(unp); UNP_LINK_WUNLOCK(); return (error); } static void uipc_detach(struct socket *so) { struct unpcb *unp, *unp2; struct sockaddr_un *saved_unp_addr; struct vnode *vp; int freeunp, local_unp_rights; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); vp = NULL; local_unp_rights = 0; - UNP_LIST_LOCK(); + UNP_LINK_WLOCK(); LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; - UNP_LIST_UNLOCK(); - - if ((unp->unp_flags & UNP_NASCENT) != 0) { - UNP_PCB_LOCK(unp); - goto teardown; - } - UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); + if ((unp->unp_flags & UNP_NASCENT) != 0) + goto teardown; if ((vp = unp->unp_vnode) != NULL) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; } unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } /* * We hold the linkage lock exclusively, so it's OK to acquire * multiple pcb locks at a time. */ while (!LIST_EMPTY(&unp->unp_refs)) { struct unpcb *ref = LIST_FIRST(&unp->unp_refs); UNP_PCB_LOCK(ref); unp_drop(ref); UNP_PCB_UNLOCK(ref); } local_unp_rights = unp_rights; - UNP_LINK_WUNLOCK(); teardown: + UNP_LINK_WUNLOCK(); unp->unp_socket->so_pcb = NULL; saved_unp_addr = unp->unp_addr; unp->unp_addr = NULL; unp->unp_refcount--; freeunp = (unp->unp_refcount == 0); if (saved_unp_addr != NULL) free(saved_unp_addr, M_SONAME); if (freeunp) { UNP_PCB_LOCK_DESTROY(unp); uma_zfree(unp_zone, unp); } else UNP_PCB_UNLOCK(unp); if (vp) vrele(vp); if (local_unp_rights) taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1); } static int uipc_disconnect(struct socket *so) { struct unpcb *unp, *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } UNP_PCB_UNLOCK(unp); UNP_LINK_WUNLOCK(); return (0); } static int uipc_listen(struct socket *so, int backlog, struct thread *td) { struct unpcb *unp; int error; if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET) return (EOPNOTSUPP); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_listen: unp == NULL")); UNP_PCB_LOCK(unp); if (unp->unp_vnode == NULL) { /* Already connected or not bound to an address. */ error = unp->unp_conn != NULL ? EINVAL : EDESTADDRREQ; UNP_PCB_UNLOCK(unp); return (error); } SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0) { cru2x(td->td_ucred, &unp->unp_peercred); solisten_proto(so, backlog); } SOCK_UNLOCK(so); UNP_PCB_UNLOCK(unp); return (error); } static int uipc_peeraddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp, *unp2; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_LINK_RLOCK(); /* * XXX: It seems that this test always fails even when connection is * established. So, this else clause is added as workaround to * return PF_LOCAL sockaddr. */ unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); if (unp2->unp_addr != NULL) sa = (struct sockaddr *) unp2->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_PCB_UNLOCK(unp2); } else { sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); } UNP_LINK_RUNLOCK(); return (0); } static int uipc_rcvd(struct socket *so, int flags) { struct unpcb *unp, *unp2; struct socket *so2; u_int mbcnt, sbcc; unp = sotounpcb(so); KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET, ("%s: socktype %d", __func__, so->so_type)); /* * Adjust backpressure on sender and wakeup any waiting to write. * * The unp lock is acquired to maintain the validity of the unp_conn * pointer; no lock on unp2 is required as unp2->unp_socket will be * static as long as we don't permit unp2 to disconnect from unp, * which is prevented by the lock on unp. We cache values from * so_rcv to avoid holding the so_rcv lock over the entire * transaction on the remote so_snd. */ SOCKBUF_LOCK(&so->so_rcv); mbcnt = so->so_rcv.sb_mbcnt; sbcc = sbavail(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); /* * There is a benign race condition at this point. If we're planning to * clear SB_STOP, but uipc_send is called on the connected socket at * this instant, it might add data to the sockbuf and set SB_STOP. Then * we would erroneously clear SB_STOP below, even though the sockbuf is * full. The race is benign because the only ill effect is to allow the * sockbuf to exceed its size limit, and the size limits are not * strictly guaranteed anyway. */ UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 == NULL) { UNP_PCB_UNLOCK(unp); return (0); } so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_snd); if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax) so2->so_snd.sb_flags &= ~SB_STOP; sowwakeup_locked(so2); UNP_PCB_UNLOCK(unp); return (0); } static int uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct unpcb *unp, *unp2; struct socket *so2; u_int mbcnt, sbcc; int error = 0; unp = sotounpcb(so); KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM || so->so_type == SOCK_SEQPACKET, ("%s: socktype %d", __func__, so->so_type)); if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } if (control != NULL && (error = unp_internalize(&control, td))) goto release; if ((nam != NULL) || (flags & PRUS_EOF)) UNP_LINK_WLOCK(); else UNP_LINK_RLOCK(); switch (so->so_type) { case SOCK_DGRAM: { const struct sockaddr *from; unp2 = unp->unp_conn; if (nam != NULL) { UNP_LINK_WLOCK_ASSERT(); if (unp2 != NULL) { error = EISCONN; break; } error = unp_connect(so, nam, td); if (error) break; unp2 = unp->unp_conn; } /* * Because connect() and send() are non-atomic in a sendto() * with a target address, it's possible that the socket will * have disconnected before the send() can run. In that case * return the slightly counter-intuitive but otherwise * correct error that the socket is not connected. */ if (unp2 == NULL) { error = ENOTCONN; break; } /* Lockless read. */ if (unp2->unp_flags & UNP_WANTCRED) control = unp_addsockcred(td, control); UNP_PCB_LOCK(unp); if (unp->unp_addr != NULL) from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_rcv); if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { sorwakeup_locked(so2); m = NULL; control = NULL; } else { SOCKBUF_UNLOCK(&so2->so_rcv); error = ENOBUFS; } if (nam != NULL) { UNP_LINK_WLOCK_ASSERT(); UNP_PCB_LOCK(unp2); unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } UNP_PCB_UNLOCK(unp); break; } case SOCK_SEQPACKET: case SOCK_STREAM: if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { UNP_LINK_WLOCK_ASSERT(); error = unp_connect(so, nam, td); if (error) break; /* XXX */ } else { error = ENOTCONN; break; } } /* Lockless read. */ if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; break; } /* * Because connect() and send() are non-atomic in a sendto() * with a target address, it's possible that the socket will * have disconnected before the send() can run. In that case * return the slightly counter-intuitive but otherwise * correct error that the socket is not connected. * * Locking here must be done carefully: the linkage lock * prevents interconnections between unpcbs from changing, so * we can traverse from unp to unp2 without acquiring unp's * lock. Socket buffer locks follow unpcb locks, so we can * acquire both remote and lock socket buffer locks. */ unp2 = unp->unp_conn; if (unp2 == NULL) { error = ENOTCONN; break; } so2 = unp2->unp_socket; UNP_PCB_LOCK(unp2); SOCKBUF_LOCK(&so2->so_rcv); if (unp2->unp_flags & UNP_WANTCRED) { /* * Credentials are passed only once on SOCK_STREAM * and SOCK_SEQPACKET. */ unp2->unp_flags &= ~UNP_WANTCRED; control = unp_addsockcred(td, control); } /* * Send to paired receive port, and then reduce send buffer * hiwater marks to maintain backpressure. Wake up readers. */ switch (so->so_type) { case SOCK_STREAM: if (control != NULL) { if (sbappendcontrol_locked(&so2->so_rcv, m, control)) control = NULL; } else sbappend_locked(&so2->so_rcv, m, flags); break; case SOCK_SEQPACKET: { const struct sockaddr *from; from = &sun_noname; /* * Don't check for space available in so2->so_rcv. * Unix domain sockets only check for space in the * sending sockbuf, and that check is performed one * level up the stack. */ if (sbappendaddr_nospacecheck_locked(&so2->so_rcv, from, m, control)) control = NULL; break; } } mbcnt = so2->so_rcv.sb_mbcnt; sbcc = sbavail(&so2->so_rcv); if (sbcc) sorwakeup_locked(so2); else SOCKBUF_UNLOCK(&so2->so_rcv); /* * The PCB lock on unp2 protects the SB_STOP flag. Without it, * it would be possible for uipc_rcvd to be called at this * point, drain the receiving sockbuf, clear SB_STOP, and then * we would set SB_STOP below. That could lead to an empty * sockbuf having SB_STOP set */ SOCKBUF_LOCK(&so->so_snd); if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax) so->so_snd.sb_flags |= SB_STOP; SOCKBUF_UNLOCK(&so->so_snd); UNP_PCB_UNLOCK(unp2); m = NULL; break; } /* * PRUS_EOF is equivalent to pru_send followed by pru_shutdown. */ if (flags & PRUS_EOF) { UNP_PCB_LOCK(unp); socantsendmore(so); unp_shutdown(unp); UNP_PCB_UNLOCK(unp); } if ((nam != NULL) || (flags & PRUS_EOF)) UNP_LINK_WUNLOCK(); else UNP_LINK_RUNLOCK(); if (control != NULL && error != 0) unp_dispose_mbuf(control); release: if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int uipc_ready(struct socket *so, struct mbuf *m, int count) { struct unpcb *unp, *unp2; struct socket *so2; int error; unp = sotounpcb(so); UNP_LINK_RLOCK(); unp2 = unp->unp_conn; UNP_PCB_LOCK(unp2); so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_rcv); if ((error = sbready(&so2->so_rcv, m, count)) == 0) sorwakeup_locked(so2); else SOCKBUF_UNLOCK(&so2->so_rcv); UNP_PCB_UNLOCK(unp2); UNP_LINK_RUNLOCK(); return (error); } static int uipc_sense(struct socket *so, struct stat *sb) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); sb->st_blksize = so->so_snd.sb_hiwat; UNP_PCB_LOCK(unp); sb->st_dev = NODEV; if (unp->unp_ino == 0) unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino; sb->st_ino = unp->unp_ino; UNP_PCB_UNLOCK(unp); return (0); } static int uipc_shutdown(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); socantsendmore(so); unp_shutdown(unp); UNP_PCB_UNLOCK(unp); UNP_LINK_WUNLOCK(); return (0); } static int uipc_sockaddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_PCB_LOCK(unp); if (unp->unp_addr != NULL) sa = (struct sockaddr *) unp->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_PCB_UNLOCK(unp); return (0); } static struct pr_usrreqs uipc_usrreqs_dgram = { .pru_abort = uipc_abort, .pru_accept = uipc_accept, .pru_attach = uipc_attach, .pru_bind = uipc_bind, .pru_bindat = uipc_bindat, .pru_connect = uipc_connect, .pru_connectat = uipc_connectat, .pru_connect2 = uipc_connect2, .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, .pru_sense = uipc_sense, .pru_shutdown = uipc_shutdown, .pru_sockaddr = uipc_sockaddr, .pru_soreceive = soreceive_dgram, .pru_close = uipc_close, }; static struct pr_usrreqs uipc_usrreqs_seqpacket = { .pru_abort = uipc_abort, .pru_accept = uipc_accept, .pru_attach = uipc_attach, .pru_bind = uipc_bind, .pru_bindat = uipc_bindat, .pru_connect = uipc_connect, .pru_connectat = uipc_connectat, .pru_connect2 = uipc_connect2, .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, .pru_sense = uipc_sense, .pru_shutdown = uipc_shutdown, .pru_sockaddr = uipc_sockaddr, .pru_soreceive = soreceive_generic, /* XXX: or...? */ .pru_close = uipc_close, }; static struct pr_usrreqs uipc_usrreqs_stream = { .pru_abort = uipc_abort, .pru_accept = uipc_accept, .pru_attach = uipc_attach, .pru_bind = uipc_bind, .pru_bindat = uipc_bindat, .pru_connect = uipc_connect, .pru_connectat = uipc_connectat, .pru_connect2 = uipc_connect2, .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, .pru_ready = uipc_ready, .pru_sense = uipc_sense, .pru_shutdown = uipc_shutdown, .pru_sockaddr = uipc_sockaddr, .pru_soreceive = soreceive_generic, .pru_close = uipc_close, }; static int uipc_ctloutput(struct socket *so, struct sockopt *sopt) { struct unpcb *unp; struct xucred xu; int error, optval; if (sopt->sopt_level != 0) return (EINVAL); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case LOCAL_PEERCRED: UNP_PCB_LOCK(unp); if (unp->unp_flags & UNP_HAVEPC) xu = unp->unp_peercred; else { if (so->so_type == SOCK_STREAM) error = ENOTCONN; else error = EINVAL; } UNP_PCB_UNLOCK(unp); if (error == 0) error = sooptcopyout(sopt, &xu, sizeof(xu)); break; case LOCAL_CREDS: /* Unlocked read. */ optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case LOCAL_CONNWAIT: /* Unlocked read. */ optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EOPNOTSUPP; break; } break; case SOPT_SET: switch (sopt->sopt_name) { case LOCAL_CREDS: case LOCAL_CONNWAIT: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; #define OPTSET(bit) do { \ UNP_PCB_LOCK(unp); \ if (optval) \ unp->unp_flags |= bit; \ else \ unp->unp_flags &= ~bit; \ UNP_PCB_UNLOCK(unp); \ } while (0) switch (sopt->sopt_name) { case LOCAL_CREDS: OPTSET(UNP_WANTCRED); break; case LOCAL_CONNWAIT: OPTSET(UNP_CONNWAIT); break; default: break; } break; #undef OPTSET default: error = ENOPROTOOPT; break; } break; default: error = EOPNOTSUPP; break; } return (error); } static int unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (unp_connectat(AT_FDCWD, so, nam, td)); } static int unp_connectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vnode *vp; - struct socket *so2, *so3; + struct socket *so2; struct unpcb *unp, *unp2, *unp3; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; struct sockaddr *sa; cap_rights_t rights; int error, len; if (nam->sa_family != AF_UNIX) return (EAFNOSUPPORT); UNP_LINK_WLOCK_ASSERT(); unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); if (nam->sa_len > sizeof(struct sockaddr_un)) return (EINVAL); len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return (EINVAL); bcopy(soun->sun_path, buf, len); buf[len] = 0; UNP_PCB_LOCK(unp); if (unp->unp_flags & UNP_CONNECTING) { UNP_PCB_UNLOCK(unp); return (EALREADY); } UNP_LINK_WUNLOCK(); unp->unp_flags |= UNP_CONNECTING; UNP_PCB_UNLOCK(unp); sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_CONNECTAT), td); error = namei(&nd); if (error) vp = NULL; else vp = nd.ni_vp; ASSERT_VOP_LOCKED(vp, "unp_connect"); NDFREE(&nd, NDF_ONLY_PNBUF); if (error) goto bad; if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD); if (error) goto bad; #endif error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); if (error) goto bad; unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); /* * Lock linkage lock for two reasons: make sure v_socket is stable, * and to protect simultaneous locking of multiple pcbs. */ UNP_LINK_WLOCK(); VOP_UNP_CONNECT(vp, &unp2); if (unp2 == NULL) { error = ECONNREFUSED; goto bad2; } so2 = unp2->unp_socket; if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad2; } + UNP_PCB_LOCK(unp); + UNP_PCB_LOCK(unp2); if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if (so2->so_options & SO_ACCEPTCONN) { CURVNET_SET(so2->so_vnet); - so3 = sonewconn(so2, 0); + so2 = sonewconn(so2, 0); CURVNET_RESTORE(); } else - so3 = NULL; - if (so3 == NULL) { + so2 = NULL; + if (so2 == NULL) { error = ECONNREFUSED; - goto bad2; + goto bad3; } - unp = sotounpcb(so); - unp2 = sotounpcb(so2); - unp3 = sotounpcb(so3); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + unp3 = sotounpcb(so2); UNP_PCB_LOCK(unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); unp3->unp_addr = (struct sockaddr_un *) sa; sa = NULL; } /* * The connector's (client's) credentials are copied from its * process structure at the time of connect() (which is now). */ cru2x(td->td_ucred, &unp3->unp_peercred); unp3->unp_flags |= UNP_HAVEPC; /* * The receiver's (server's) credentials are copied from the * unp_peercred member of socket on which the former called * listen(); uipc_listen() cached that process's credentials * at that time so we can use them now. */ memcpy(&unp->unp_peercred, &unp2->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; if (unp2->unp_flags & UNP_WANTCRED) unp3->unp_flags |= UNP_WANTCRED; - UNP_PCB_UNLOCK(unp3); UNP_PCB_UNLOCK(unp2); - UNP_PCB_UNLOCK(unp); + unp2 = unp3; #ifdef MAC - mac_socketpeer_set_from_socket(so, so3); - mac_socketpeer_set_from_socket(so3, so); + mac_socketpeer_set_from_socket(so, so2); + mac_socketpeer_set_from_socket(so2, so); #endif - - so2 = so3; } - unp = sotounpcb(so); - KASSERT(unp != NULL, ("unp_connect: unp == NULL")); - unp2 = sotounpcb(so2); - KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL")); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + + KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 && + sotounpcb(so2) == unp2, + ("%s: unp2 %p so2 %p", __func__, unp2, so2)); error = unp_connect2(so, so2, PRU_CONNECT); +bad3: UNP_PCB_UNLOCK(unp2); UNP_PCB_UNLOCK(unp); bad2: UNP_LINK_WUNLOCK(); bad: if (vp != NULL) vput(vp); free(sa, M_SONAME); UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); unp->unp_flags &= ~UNP_CONNECTING; UNP_PCB_UNLOCK(unp); return (error); } static int unp_connect2(struct socket *so, struct socket *so2, int req) { struct unpcb *unp; struct unpcb *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect2: unp == NULL")); unp2 = sotounpcb(so2); KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); UNP_LINK_WLOCK_ASSERT(); UNP_PCB_LOCK_ASSERT(unp); UNP_PCB_LOCK_ASSERT(unp2); if (so2->so_type != so->so_type) return (EPROTOTYPE); unp2->unp_flags &= ~UNP_NASCENT; unp->unp_conn = unp2; switch (so->so_type) { case SOCK_DGRAM: LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); soisconnected(so); break; case SOCK_STREAM: case SOCK_SEQPACKET: unp2->unp_conn = unp; if (req == PRU_CONNECT && ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) soisconnecting(so); else soisconnected(so); soisconnected(so2); break; default: panic("unp_connect2"); } return (0); } static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2) { struct socket *so; KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL")); UNP_LINK_WLOCK_ASSERT(); UNP_PCB_LOCK_ASSERT(unp); UNP_PCB_LOCK_ASSERT(unp2); unp->unp_conn = NULL; switch (unp->unp_socket->so_type) { case SOCK_DGRAM: LIST_REMOVE(unp, unp_reflink); so = unp->unp_socket; SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); break; case SOCK_STREAM: case SOCK_SEQPACKET: soisdisconnected(unp->unp_socket); unp2->unp_conn = NULL; soisdisconnected(unp2->unp_socket); break; } } /* * unp_pcblist() walks the global list of struct unpcb's to generate a * pointer list, bumping the refcount on each unpcb. It then copies them out * sequentially, validating the generation number on each to see if it has * been detached. All of this is necessary because copyout() may sleep on * disk I/O. */ static int unp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; int freeunp; struct unpcb *unp, **unp_list; unp_gen_t gencnt; struct xunpgen *xug; struct unp_head *head; struct xunpcb *xu; switch ((intptr_t)arg1) { case SOCK_STREAM: head = &unp_shead; break; case SOCK_DGRAM: head = &unp_dhead; break; case SOCK_SEQPACKET: head = &unp_sphead; break; default: panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1); } /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = unp_count; req->oldidx = 2 * (sizeof *xug) + (n + n/8) * sizeof(struct xunpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); - UNP_LIST_LOCK(); + UNP_LINK_RLOCK(); gencnt = unp_gencnt; n = unp_count; - UNP_LIST_UNLOCK(); + UNP_LINK_RUNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; xug->xug_gen = gencnt; xug->xug_sogen = so_gencnt; error = SYSCTL_OUT(req, xug, sizeof *xug); if (error) { free(xug, M_TEMP); return (error); } unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); - UNP_LIST_LOCK(); + UNP_LINK_RLOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { UNP_PCB_LOCK(unp); if (unp->unp_gencnt <= gencnt) { if (cr_cansee(req->td->td_ucred, unp->unp_socket->so_cred)) { UNP_PCB_UNLOCK(unp); continue; } unp_list[i++] = unp; unp->unp_refcount++; } UNP_PCB_UNLOCK(unp); } - UNP_LIST_UNLOCK(); + UNP_LINK_RUNLOCK(); n = i; /* In case we lost some during malloc. */ error = 0; xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); for (i = 0; i < n; i++) { unp = unp_list[i]; UNP_PCB_LOCK(unp); unp->unp_refcount--; if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) { xu->xu_len = sizeof *xu; xu->xu_unpp = unp; /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. */ if (unp->unp_addr != NULL) bcopy(unp->unp_addr, &xu->xu_addr, unp->unp_addr->sun_len); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) bcopy(unp->unp_conn->unp_addr, &xu->xu_caddr, unp->unp_conn->unp_addr->sun_len); bcopy(unp, &xu->xu_unp, sizeof *unp); sotoxsocket(unp->unp_socket, &xu->xu_socket); UNP_PCB_UNLOCK(unp); error = SYSCTL_OUT(req, xu, sizeof *xu); } else { freeunp = (unp->unp_refcount == 0); UNP_PCB_UNLOCK(unp); if (freeunp) { UNP_PCB_LOCK_DESTROY(unp); uma_zfree(unp_zone, unp); } } } free(xu, M_TEMP); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ xug->xug_gen = unp_gencnt; xug->xug_sogen = so_gencnt; xug->xug_count = unp_count; error = SYSCTL_OUT(req, xug, sizeof *xug); } free(unp_list, M_TEMP); free(xug, M_TEMP); return (error); } SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb", "List of active local seqpacket sockets"); static void unp_shutdown(struct unpcb *unp) { struct unpcb *unp2; struct socket *so; UNP_LINK_WLOCK_ASSERT(); UNP_PCB_LOCK_ASSERT(unp); unp2 = unp->unp_conn; if ((unp->unp_socket->so_type == SOCK_STREAM || (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) { so = unp2->unp_socket; if (so != NULL) socantrcvmore(so); } } static void unp_drop(struct unpcb *unp) { struct socket *so = unp->unp_socket; struct unpcb *unp2; UNP_LINK_WLOCK_ASSERT(); UNP_PCB_LOCK_ASSERT(unp); /* * Regardless of whether the socket's peer dropped the connection * with this socket by aborting or disconnecting, POSIX requires * that ECONNRESET is returned. */ so->so_error = ECONNRESET; unp2 = unp->unp_conn; if (unp2 == NULL) return; UNP_PCB_LOCK(unp2); unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } static void unp_freerights(struct filedescent **fdep, int fdcount) { struct file *fp; int i; KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount)); for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; filecaps_free(&fdep[i]->fde_caps); unp_discard(fp); } free(fdep[0], M_FILECAPS); } static int unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags) { struct thread *td = curthread; /* XXX */ struct cmsghdr *cm = mtod(control, struct cmsghdr *); int i; int *fdp; struct filedesc *fdesc = td->td_proc->p_fd; struct filedescent **fdep; void *data; socklen_t clen = control->m_len, datalen; int error, newfds; u_int newlen; UNP_LINK_UNLOCK_ASSERT(); error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) { error = EINVAL; break; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { newfds = datalen / sizeof(*fdep); if (newfds == 0) goto next; fdep = data; /* If we're not outputting the descriptors free them. */ if (error || controlp == NULL) { unp_freerights(fdep, newfds); goto next; } FILEDESC_XLOCK(fdesc); /* * Now change each pointer to an fd in the global * table to an integer that is the index to the local * fd table entry that we set up to point to the * global one we are transferring. */ newlen = newfds * sizeof(int); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_XUNLOCK(fdesc); error = E2BIG; unp_freerights(fdep, newfds); goto next; } fdp = (int *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); if (fdallocn(td, 0, fdp, newfds) != 0) { FILEDESC_XUNLOCK(fdesc); error = EMSGSIZE; unp_freerights(fdep, newfds); m_freem(*controlp); *controlp = NULL; goto next; } for (i = 0; i < newfds; i++, fdp++) { _finstall(fdesc, fdep[i]->fde_file, *fdp, (flags & MSG_CMSG_CLOEXEC) != 0 ? UF_EXCLOSE : 0, &fdep[i]->fde_caps); unp_externalize_fp(fdep[i]->fde_file); } FILEDESC_XUNLOCK(fdesc); free(fdep[0], M_FILECAPS); } else { /* We can just copy anything else across. */ if (error || controlp == NULL) goto next; *controlp = sbcreatecontrol(NULL, datalen, cm->cmsg_type, cm->cmsg_level); if (*controlp == NULL) { error = ENOBUFS; goto next; } bcopy(data, CMSG_DATA(mtod(*controlp, struct cmsghdr *)), datalen); } controlp = &(*controlp)->m_next; next: if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } m_freem(control); return (error); } static void unp_zone_change(void *tag) { uma_zone_set_max(unp_zone, maxsockets); } static void unp_init(void) { #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) return; #endif unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); if (unp_zone == NULL) panic("unp_init"); uma_zone_set_max(unp_zone, maxsockets); uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, NULL, EVENTHANDLER_PRI_ANY); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); LIST_INIT(&unp_sphead); SLIST_INIT(&unp_defers); TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL); TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL); UNP_LINK_LOCK_INIT(); - UNP_LIST_LOCK_INIT(); UNP_DEFERRED_LOCK_INIT(); } static int unp_internalize(struct mbuf **controlp, struct thread *td) { struct mbuf *control = *controlp; struct proc *p = td->td_proc; struct filedesc *fdesc = p->p_fd; struct bintime *bt; struct cmsghdr *cm = mtod(control, struct cmsghdr *); struct cmsgcred *cmcred; struct filedescent *fde, **fdep, *fdev; struct file *fp; struct timeval *tv; struct timespec *ts; int i, *fdp; void *data; socklen_t clen = control->m_len, datalen; int error, oldfds; u_int newlen; UNP_LINK_UNLOCK_ASSERT(); error = 0; *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) { error = EINVAL; goto out; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; switch (cm->cmsg_type) { /* * Fill in credential information. */ case SCM_CREDS: *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } cmcred = (struct cmsgcred *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); cmcred->cmcred_pid = p->p_pid; cmcred->cmcred_uid = td->td_ucred->cr_ruid; cmcred->cmcred_gid = td->td_ucred->cr_rgid; cmcred->cmcred_euid = td->td_ucred->cr_uid; cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); for (i = 0; i < cmcred->cmcred_ngroups; i++) cmcred->cmcred_groups[i] = td->td_ucred->cr_groups[i]; break; case SCM_RIGHTS: oldfds = datalen / sizeof (int); if (oldfds == 0) break; /* * Check that all the FDs passed in refer to legal * files. If not, reject the entire operation. */ fdp = data; FILEDESC_SLOCK(fdesc); for (i = 0; i < oldfds; i++, fdp++) { fp = fget_locked(fdesc, *fdp); if (fp == NULL) { FILEDESC_SUNLOCK(fdesc); error = EBADF; goto out; } if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { FILEDESC_SUNLOCK(fdesc); error = EOPNOTSUPP; goto out; } } /* * Now replace the integer FDs with pointers to the * file structure and capability rights. */ newlen = oldfds * sizeof(fdep[0]); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_SUNLOCK(fdesc); error = E2BIG; goto out; } fdp = data; fdep = (struct filedescent **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS, M_WAITOK); for (i = 0; i < oldfds; i++, fdev++, fdp++) { fde = &fdesc->fd_ofiles[*fdp]; fdep[i] = fdev; fdep[i]->fde_file = fde->fde_file; filecaps_copy(&fde->fde_caps, &fdep[i]->fde_caps, true); unp_internalize_fp(fdep[i]->fde_file); } FILEDESC_SUNLOCK(fdesc); break; case SCM_TIMESTAMP: *controlp = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } tv = (struct timeval *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); microtime(tv); break; case SCM_BINTIME: *controlp = sbcreatecontrol(NULL, sizeof(*bt), SCM_BINTIME, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } bt = (struct bintime *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); bintime(bt); break; case SCM_REALTIME: *controlp = sbcreatecontrol(NULL, sizeof(*ts), SCM_REALTIME, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } ts = (struct timespec *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); nanotime(ts); break; case SCM_MONOTONIC: *controlp = sbcreatecontrol(NULL, sizeof(*ts), SCM_MONOTONIC, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } ts = (struct timespec *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); nanouptime(ts); break; default: error = EINVAL; goto out; } controlp = &(*controlp)->m_next; if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } out: m_freem(control); return (error); } static struct mbuf * unp_addsockcred(struct thread *td, struct mbuf *control) { struct mbuf *m, *n, *n_prev; struct sockcred *sc; const struct cmsghdr *cm; int ngroups; int i; ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET); if (m == NULL) return (control); sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *)); sc->sc_uid = td->td_ucred->cr_ruid; sc->sc_euid = td->td_ucred->cr_uid; sc->sc_gid = td->td_ucred->cr_rgid; sc->sc_egid = td->td_ucred->cr_gid; sc->sc_ngroups = ngroups; for (i = 0; i < sc->sc_ngroups; i++) sc->sc_groups[i] = td->td_ucred->cr_groups[i]; /* * Unlink SCM_CREDS control messages (struct cmsgcred), since just * created SCM_CREDS control message (struct sockcred) has another * format. */ if (control != NULL) for (n = control, n_prev = NULL; n != NULL;) { cm = mtod(n, struct cmsghdr *); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_CREDS) { if (n_prev == NULL) control = n->m_next; else n_prev->m_next = n->m_next; n = m_free(n); } else { n_prev = n; n = n->m_next; } } /* Prepend it to the head. */ m->m_next = control; return (m); } static struct unpcb * fptounp(struct file *fp) { struct socket *so; if (fp->f_type != DTYPE_SOCKET) return (NULL); if ((so = fp->f_data) == NULL) return (NULL); if (so->so_proto->pr_domain != &localdomain) return (NULL); return sotounpcb(so); } static void unp_discard(struct file *fp) { struct unp_defer *dr; if (unp_externalize_fp(fp)) { dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK); dr->ud_fp = fp; UNP_DEFERRED_LOCK(); SLIST_INSERT_HEAD(&unp_defers, dr, ud_link); UNP_DEFERRED_UNLOCK(); atomic_add_int(&unp_defers_count, 1); taskqueue_enqueue(taskqueue_thread, &unp_defer_task); } else (void) closef(fp, (struct thread *)NULL); } static void unp_process_defers(void *arg __unused, int pending) { struct unp_defer *dr; SLIST_HEAD(, unp_defer) drl; int count; SLIST_INIT(&drl); for (;;) { UNP_DEFERRED_LOCK(); if (SLIST_FIRST(&unp_defers) == NULL) { UNP_DEFERRED_UNLOCK(); break; } SLIST_SWAP(&unp_defers, &drl, unp_defer); UNP_DEFERRED_UNLOCK(); count = 0; while ((dr = SLIST_FIRST(&drl)) != NULL) { SLIST_REMOVE_HEAD(&drl, ud_link); closef(dr->ud_fp, NULL); free(dr, M_TEMP); count++; } atomic_add_int(&unp_defers_count, -count); } } static void unp_internalize_fp(struct file *fp) { struct unpcb *unp; UNP_LINK_WLOCK(); if ((unp = fptounp(fp)) != NULL) { unp->unp_file = fp; unp->unp_msgcount++; } fhold(fp); unp_rights++; UNP_LINK_WUNLOCK(); } static int unp_externalize_fp(struct file *fp) { struct unpcb *unp; int ret; UNP_LINK_WLOCK(); if ((unp = fptounp(fp)) != NULL) { unp->unp_msgcount--; ret = 1; } else ret = 0; unp_rights--; UNP_LINK_WUNLOCK(); return (ret); } /* * unp_defer indicates whether additional work has been defered for a future * pass through unp_gc(). It is thread local and does not require explicit * synchronization. */ static int unp_marked; static int unp_unreachable; static void unp_accessable(struct filedescent **fdep, int fdcount) { struct unpcb *unp; struct file *fp; int i; for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; if ((unp = fptounp(fp)) == NULL) continue; if (unp->unp_gcflag & UNPGC_REF) continue; unp->unp_gcflag &= ~UNPGC_DEAD; unp->unp_gcflag |= UNPGC_REF; unp_marked++; } } static void unp_gc_process(struct unpcb *unp) { - struct socket *soa; - struct socket *so; + struct socket *so, *soa; struct file *fp; /* Already processed. */ if (unp->unp_gcflag & UNPGC_SCANNED) return; fp = unp->unp_file; /* * Check for a socket potentially in a cycle. It must be in a * queue as indicated by msgcount, and this must equal the file * reference count. Note that when msgcount is 0 the file is NULL. */ if ((unp->unp_gcflag & UNPGC_REF) == 0 && fp && unp->unp_msgcount != 0 && fp->f_count == unp->unp_msgcount) { unp->unp_gcflag |= UNPGC_DEAD; unp_unreachable++; return; } - /* - * Mark all sockets we reference with RIGHTS. - */ so = unp->unp_socket; - if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { - SOCKBUF_LOCK(&so->so_rcv); - unp_scan(so->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&so->so_rcv); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + /* + * Mark all sockets in our accept queue. + */ + TAILQ_FOREACH(soa, &so->sol_comp, so_list) { + if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) + continue; + SOCKBUF_LOCK(&soa->so_rcv); + unp_scan(soa->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&soa->so_rcv); + } + } else { + /* + * Mark all sockets we reference with RIGHTS. + */ + if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { + SOCKBUF_LOCK(&so->so_rcv); + unp_scan(so->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&so->so_rcv); + } } - - /* - * Mark all sockets in our accept queue. - */ - ACCEPT_LOCK(); - TAILQ_FOREACH(soa, &so->so_comp, so_list) { - if ((sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) != 0) - continue; - SOCKBUF_LOCK(&soa->so_rcv); - unp_scan(soa->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&soa->so_rcv); - } - ACCEPT_UNLOCK(); + SOCK_UNLOCK(so); unp->unp_gcflag |= UNPGC_SCANNED; } static int unp_recycled; SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, "Number of unreachable sockets claimed by the garbage collector."); static int unp_taskcount; SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, "Number of times the garbage collector has run."); static void unp_gc(__unused void *arg, int pending) { struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead, NULL }; struct unp_head **head; struct file *f, **unref; struct unpcb *unp; int i, total; unp_taskcount++; - UNP_LIST_LOCK(); + UNP_LINK_RLOCK(); /* * First clear all gc flags from previous runs, apart from * UNPGC_IGNORE_RIGHTS. */ for (head = heads; *head != NULL; head++) LIST_FOREACH(unp, *head, unp_link) unp->unp_gcflag = (unp->unp_gcflag & UNPGC_IGNORE_RIGHTS); /* * Scan marking all reachable sockets with UNPGC_REF. Once a socket * is reachable all of the sockets it references are reachable. * Stop the scan once we do a complete loop without discovering * a new reachable socket. */ do { unp_unreachable = 0; unp_marked = 0; for (head = heads; *head != NULL; head++) LIST_FOREACH(unp, *head, unp_link) unp_gc_process(unp); } while (unp_marked); - UNP_LIST_UNLOCK(); + UNP_LINK_RUNLOCK(); if (unp_unreachable == 0) return; /* * Allocate space for a local list of dead unpcbs. */ unref = malloc(unp_unreachable * sizeof(struct file *), M_TEMP, M_WAITOK); /* * Iterate looking for sockets which have been specifically marked * as as unreachable and store them locally. */ UNP_LINK_RLOCK(); - UNP_LIST_LOCK(); for (total = 0, head = heads; *head != NULL; head++) LIST_FOREACH(unp, *head, unp_link) if ((unp->unp_gcflag & UNPGC_DEAD) != 0) { f = unp->unp_file; if (unp->unp_msgcount == 0 || f == NULL || f->f_count != unp->unp_msgcount) continue; unref[total++] = f; fhold(f); KASSERT(total <= unp_unreachable, ("unp_gc: incorrect unreachable count.")); } - UNP_LIST_UNLOCK(); UNP_LINK_RUNLOCK(); /* * Now flush all sockets, free'ing rights. This will free the * struct files associated with these sockets but leave each socket * with one remaining ref. */ for (i = 0; i < total; i++) { struct socket *so; so = unref[i]->f_data; CURVNET_SET(so->so_vnet); sorflush(so); CURVNET_RESTORE(); } /* * And finally release the sockets so they can be reclaimed. */ for (i = 0; i < total; i++) fdrop(unref[i], NULL); unp_recycled += total; free(unref, M_TEMP); } static void unp_dispose_mbuf(struct mbuf *m) { if (m) unp_scan(m, unp_freerights); } /* * Synchronize against unp_gc, which can trip over data as we are freeing it. */ static void unp_dispose(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); - UNP_LIST_LOCK(); + UNP_LINK_WLOCK(); unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS; - UNP_LIST_UNLOCK(); - unp_dispose_mbuf(so->so_rcv.sb_mb); + UNP_LINK_WUNLOCK(); + if (!SOLISTENING(so)) + unp_dispose_mbuf(so->so_rcv.sb_mb); } static void unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int)) { struct mbuf *m; struct cmsghdr *cm; void *data; socklen_t clen, datalen; while (m0 != NULL) { for (m = m0; m; m = m->m_next) { if (m->m_type != MT_CONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) break; data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { (*op)(data, datalen / sizeof(struct filedescent *)); } if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } } m0 = m0->m_nextpkt; } } /* * A helper function called by VFS before socket-type vnode reclamation. * For an active vnode it clears unp_vnode pointer and decrements unp_vnode * use count. */ void vfs_unp_reclaim(struct vnode *vp) { struct unpcb *unp; int active; ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim"); KASSERT(vp->v_type == VSOCK, ("vfs_unp_reclaim: vp->v_type != VSOCK")); active = 0; UNP_LINK_WLOCK(); VOP_UNP_CONNECT(vp, &unp); if (unp == NULL) goto done; UNP_PCB_LOCK(unp); if (unp->unp_vnode == vp) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; active = 1; } UNP_PCB_UNLOCK(unp); done: UNP_LINK_WUNLOCK(); if (active) vunref(vp); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_unpflags(int unp_flags) { int comma; comma = 0; if (unp_flags & UNP_HAVEPC) { db_printf("%sUNP_HAVEPC", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_WANTCRED) { db_printf("%sUNP_WANTCRED", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_CONNWAIT) { db_printf("%sUNP_CONNWAIT", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_CONNECTING) { db_printf("%sUNP_CONNECTING", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_BINDING) { db_printf("%sUNP_BINDING", comma ? ", " : ""); comma = 1; } } static void db_print_xucred(int indent, struct xucred *xu) { int comma, i; db_print_indent(indent); db_printf("cr_version: %u cr_uid: %u cr_ngroups: %d\n", xu->cr_version, xu->cr_uid, xu->cr_ngroups); db_print_indent(indent); db_printf("cr_groups: "); comma = 0; for (i = 0; i < xu->cr_ngroups; i++) { db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]); comma = 1; } db_printf("\n"); } static void db_print_unprefs(int indent, struct unp_head *uh) { struct unpcb *unp; int counter; counter = 0; LIST_FOREACH(unp, uh, unp_reflink) { if (counter % 4 == 0) db_print_indent(indent); db_printf("%p ", unp); if (counter % 4 == 3) db_printf("\n"); counter++; } if (counter != 0 && counter % 4 != 0) db_printf("\n"); } DB_SHOW_COMMAND(unpcb, db_show_unpcb) { struct unpcb *unp; if (!have_addr) { db_printf("usage: show unpcb \n"); return; } unp = (struct unpcb *)addr; db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket, unp->unp_vnode); db_printf("unp_ino: %ju unp_conn: %p\n", (uintmax_t)unp->unp_ino, unp->unp_conn); db_printf("unp_refs:\n"); db_print_unprefs(2, &unp->unp_refs); /* XXXRW: Would be nice to print the full address, if any. */ db_printf("unp_addr: %p\n", unp->unp_addr); db_printf("unp_gencnt: %llu\n", (unsigned long long)unp->unp_gencnt); db_printf("unp_flags: %x (", unp->unp_flags); db_print_unpflags(unp->unp_flags); db_printf(")\n"); db_printf("unp_peercred:\n"); db_print_xucred(2, &unp->unp_peercred); db_printf("unp_refcount: %u\n", unp->unp_refcount); } #endif Index: head/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c =================================================================== --- head/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c (revision 319721) +++ head/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c (revision 319722) @@ -1,2979 +1,2971 @@ /* * ng_btsocket_l2cap.c */ /*- * Copyright (c) 2001-2002 Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: ng_btsocket_l2cap.c,v 1.16 2003/09/14 23:29:06 max Exp $ * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* MALLOC define */ #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_BTSOCKET_L2CAP, "netgraph_btsocks_l2cap", "Netgraph Bluetooth L2CAP sockets"); #else #define M_NETGRAPH_BTSOCKET_L2CAP M_NETGRAPH #endif /* NG_SEPARATE_MALLOC */ /* Netgraph node methods */ static ng_constructor_t ng_btsocket_l2cap_node_constructor; static ng_rcvmsg_t ng_btsocket_l2cap_node_rcvmsg; static ng_shutdown_t ng_btsocket_l2cap_node_shutdown; static ng_newhook_t ng_btsocket_l2cap_node_newhook; static ng_connect_t ng_btsocket_l2cap_node_connect; static ng_rcvdata_t ng_btsocket_l2cap_node_rcvdata; static ng_disconnect_t ng_btsocket_l2cap_node_disconnect; static void ng_btsocket_l2cap_input (void *, int); static void ng_btsocket_l2cap_rtclean (void *, int); /* Netgraph type descriptor */ static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_BTSOCKET_L2CAP_NODE_TYPE, .constructor = ng_btsocket_l2cap_node_constructor, .rcvmsg = ng_btsocket_l2cap_node_rcvmsg, .shutdown = ng_btsocket_l2cap_node_shutdown, .newhook = ng_btsocket_l2cap_node_newhook, .connect = ng_btsocket_l2cap_node_connect, .rcvdata = ng_btsocket_l2cap_node_rcvdata, .disconnect = ng_btsocket_l2cap_node_disconnect, }; /* Globals */ extern int ifqmaxlen; static u_int32_t ng_btsocket_l2cap_debug_level; static node_p ng_btsocket_l2cap_node; static struct ng_bt_itemq ng_btsocket_l2cap_queue; static struct mtx ng_btsocket_l2cap_queue_mtx; static struct task ng_btsocket_l2cap_queue_task; static LIST_HEAD(, ng_btsocket_l2cap_pcb) ng_btsocket_l2cap_sockets; static struct mtx ng_btsocket_l2cap_sockets_mtx; static LIST_HEAD(, ng_btsocket_l2cap_rtentry) ng_btsocket_l2cap_rt; static struct mtx ng_btsocket_l2cap_rt_mtx; static struct task ng_btsocket_l2cap_rt_task; static struct timeval ng_btsocket_l2cap_lasttime; static int ng_btsocket_l2cap_curpps; /* Sysctl tree */ SYSCTL_DECL(_net_bluetooth_l2cap_sockets); static SYSCTL_NODE(_net_bluetooth_l2cap_sockets, OID_AUTO, seq, CTLFLAG_RW, 0, "Bluetooth SEQPACKET L2CAP sockets family"); SYSCTL_UINT(_net_bluetooth_l2cap_sockets_seq, OID_AUTO, debug_level, CTLFLAG_RW, &ng_btsocket_l2cap_debug_level, NG_BTSOCKET_WARN_LEVEL, "Bluetooth SEQPACKET L2CAP sockets debug level"); SYSCTL_UINT(_net_bluetooth_l2cap_sockets_seq, OID_AUTO, queue_len, CTLFLAG_RD, &ng_btsocket_l2cap_queue.len, 0, "Bluetooth SEQPACKET L2CAP sockets input queue length"); SYSCTL_UINT(_net_bluetooth_l2cap_sockets_seq, OID_AUTO, queue_maxlen, CTLFLAG_RD, &ng_btsocket_l2cap_queue.maxlen, 0, "Bluetooth SEQPACKET L2CAP sockets input queue max. length"); SYSCTL_UINT(_net_bluetooth_l2cap_sockets_seq, OID_AUTO, queue_drops, CTLFLAG_RD, &ng_btsocket_l2cap_queue.drops, 0, "Bluetooth SEQPACKET L2CAP sockets input queue drops"); /* Debug */ #define NG_BTSOCKET_L2CAP_INFO \ if (ng_btsocket_l2cap_debug_level >= NG_BTSOCKET_INFO_LEVEL && \ ppsratecheck(&ng_btsocket_l2cap_lasttime, &ng_btsocket_l2cap_curpps, 1)) \ printf #define NG_BTSOCKET_L2CAP_WARN \ if (ng_btsocket_l2cap_debug_level >= NG_BTSOCKET_WARN_LEVEL && \ ppsratecheck(&ng_btsocket_l2cap_lasttime, &ng_btsocket_l2cap_curpps, 1)) \ printf #define NG_BTSOCKET_L2CAP_ERR \ if (ng_btsocket_l2cap_debug_level >= NG_BTSOCKET_ERR_LEVEL && \ ppsratecheck(&ng_btsocket_l2cap_lasttime, &ng_btsocket_l2cap_curpps, 1)) \ printf #define NG_BTSOCKET_L2CAP_ALERT \ if (ng_btsocket_l2cap_debug_level >= NG_BTSOCKET_ALERT_LEVEL && \ ppsratecheck(&ng_btsocket_l2cap_lasttime, &ng_btsocket_l2cap_curpps, 1)) \ printf /* * Netgraph message processing routines */ static int ng_btsocket_l2cap_process_l2ca_con_req_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_con_rsp_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_con_ind (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_cfg_req_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_cfg_rsp_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_cfg_ind (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_discon_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_discon_ind (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_write_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); /* * Send L2CA_xxx messages to the lower layer */ static int ng_btsocket_l2cap_send_l2ca_con_req (ng_btsocket_l2cap_pcb_p); static int ng_btsocket_l2cap_send_l2ca_con_rsp_req (u_int32_t, ng_btsocket_l2cap_rtentry_p, bdaddr_p, int, int, int, int); static int ng_btsocket_l2cap_send_l2ca_cfg_req (ng_btsocket_l2cap_pcb_p); static int ng_btsocket_l2cap_send_l2ca_cfg_rsp (ng_btsocket_l2cap_pcb_p); static int ng_btsocket_l2cap_send_l2ca_discon_req (u_int32_t, ng_btsocket_l2cap_pcb_p); static int ng_btsocket_l2cap_send2 (ng_btsocket_l2cap_pcb_p); /* * Timeout processing routines */ static void ng_btsocket_l2cap_timeout (ng_btsocket_l2cap_pcb_p); static void ng_btsocket_l2cap_untimeout (ng_btsocket_l2cap_pcb_p); static void ng_btsocket_l2cap_process_timeout (void *); /* * Other stuff */ static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_addr(bdaddr_p, int); static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_token(u_int32_t); static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_cid (bdaddr_p, int,int); static int ng_btsocket_l2cap_result2errno(int); static int ng_btsock_l2cap_addrtype_to_linktype(int addrtype); #define ng_btsocket_l2cap_wakeup_input_task() \ taskqueue_enqueue(taskqueue_swi_giant, &ng_btsocket_l2cap_queue_task) #define ng_btsocket_l2cap_wakeup_route_task() \ taskqueue_enqueue(taskqueue_swi_giant, &ng_btsocket_l2cap_rt_task) int ng_btsock_l2cap_addrtype_to_linktype(int addrtype) { switch(addrtype){ case BDADDR_LE_PUBLIC: return NG_HCI_LINK_LE_PUBLIC; case BDADDR_LE_RANDOM: return NG_HCI_LINK_LE_RANDOM; default: return NG_HCI_LINK_ACL; } } /***************************************************************************** ***************************************************************************** ** Netgraph node interface ***************************************************************************** *****************************************************************************/ /* * Netgraph node constructor. Do not allow to create node of this type. */ static int ng_btsocket_l2cap_node_constructor(node_p node) { return (EINVAL); } /* ng_btsocket_l2cap_node_constructor */ /* * Do local shutdown processing. Let old node go and create new fresh one. */ static int ng_btsocket_l2cap_node_shutdown(node_p node) { int error = 0; NG_NODE_UNREF(node); /* Create new node */ error = ng_make_node_common(&typestruct, &ng_btsocket_l2cap_node); if (error != 0) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not create Netgraph node, error=%d\n", __func__, error); ng_btsocket_l2cap_node = NULL; return (error); } error = ng_name_node(ng_btsocket_l2cap_node, NG_BTSOCKET_L2CAP_NODE_TYPE); if (error != 0) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not name Netgraph node, error=%d\n", __func__, error); NG_NODE_UNREF(ng_btsocket_l2cap_node); ng_btsocket_l2cap_node = NULL; return (error); } return (0); } /* ng_btsocket_l2cap_node_shutdown */ /* * We allow any hook to be connected to the node. */ static int ng_btsocket_l2cap_node_newhook(node_p node, hook_p hook, char const *name) { return (0); } /* ng_btsocket_l2cap_node_newhook */ /* * Just say "YEP, that's OK by me!" */ static int ng_btsocket_l2cap_node_connect(hook_p hook) { NG_HOOK_SET_PRIVATE(hook, NULL); NG_HOOK_REF(hook); /* Keep extra reference to the hook */ #if 0 NG_HOOK_FORCE_QUEUE(NG_HOOK_PEER(hook)); NG_HOOK_FORCE_QUEUE(hook); #endif return (0); } /* ng_btsocket_l2cap_node_connect */ /* * Hook disconnection. Schedule route cleanup task */ static int ng_btsocket_l2cap_node_disconnect(hook_p hook) { /* * If hook has private information than we must have this hook in * the routing table and must schedule cleaning for the routing table. * Otherwise hook was connected but we never got "hook_info" message, * so we have never added this hook to the routing table and it save * to just delete it. */ if (NG_HOOK_PRIVATE(hook) != NULL) return (ng_btsocket_l2cap_wakeup_route_task()); NG_HOOK_UNREF(hook); /* Remove extra reference */ return (0); } /* ng_btsocket_l2cap_node_disconnect */ /* * Process incoming messages */ static int ng_btsocket_l2cap_node_rcvmsg(node_p node, item_p item, hook_p hook) { struct ng_mesg *msg = NGI_MSG(item); /* item still has message */ int error = 0; if (msg != NULL && msg->header.typecookie == NGM_L2CAP_COOKIE) { mtx_lock(&ng_btsocket_l2cap_queue_mtx); if (NG_BT_ITEMQ_FULL(&ng_btsocket_l2cap_queue)) { NG_BTSOCKET_L2CAP_ERR( "%s: Input queue is full (msg)\n", __func__); NG_BT_ITEMQ_DROP(&ng_btsocket_l2cap_queue); NG_FREE_ITEM(item); error = ENOBUFS; } else { if (hook != NULL) { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); } NG_BT_ITEMQ_ENQUEUE(&ng_btsocket_l2cap_queue, item); error = ng_btsocket_l2cap_wakeup_input_task(); } mtx_unlock(&ng_btsocket_l2cap_queue_mtx); } else { NG_FREE_ITEM(item); error = EINVAL; } return (error); } /* ng_btsocket_l2cap_node_rcvmsg */ /* * Receive data on a hook */ static int ng_btsocket_l2cap_node_rcvdata(hook_p hook, item_p item) { int error = 0; mtx_lock(&ng_btsocket_l2cap_queue_mtx); if (NG_BT_ITEMQ_FULL(&ng_btsocket_l2cap_queue)) { NG_BTSOCKET_L2CAP_ERR( "%s: Input queue is full (data)\n", __func__); NG_BT_ITEMQ_DROP(&ng_btsocket_l2cap_queue); NG_FREE_ITEM(item); error = ENOBUFS; } else { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); NG_BT_ITEMQ_ENQUEUE(&ng_btsocket_l2cap_queue, item); error = ng_btsocket_l2cap_wakeup_input_task(); } mtx_unlock(&ng_btsocket_l2cap_queue_mtx); return (error); } /* ng_btsocket_l2cap_node_rcvdata */ /* * Process L2CA_Connect respose. Socket layer must have initiated connection, * so we have to have a socket associated with message token. */ static int ng_btsocket_l2cap_process_l2ca_con_req_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_con_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; int error = 0; if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_con_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Look for the socket with the token */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Connect response, token=%d, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, status=%d, " \ "state=%d\n", __func__, msg->header.token, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, op->lcid, op->result, op->status, pcb->state); if (pcb->state != NG_BTSOCKET_L2CAP_CONNECTING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } ng_btsocket_l2cap_untimeout(pcb); if (op->result == NG_L2CAP_PENDING) { ng_btsocket_l2cap_timeout(pcb); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } if (op->result == NG_L2CAP_SUCCESS){ if((pcb->idtype == NG_L2CAP_L2CA_IDTYPE_ATT)|| (pcb->idtype == NG_L2CAP_L2CA_IDTYPE_SMP)){ pcb->encryption = op->encryption; pcb->cid = op->lcid; if(pcb->need_encrypt && !(pcb->encryption)){ ng_btsocket_l2cap_timeout(pcb); pcb->state = NG_BTSOCKET_L2CAP_W4_ENC_CHANGE; }else{ pcb->state = NG_BTSOCKET_L2CAP_OPEN; soisconnected(pcb->so); } }else{ /* * Channel is now open, so update local channel ID and * start configuration process. Source and destination * addresses as well as route must be already set. */ pcb->cid = op->lcid; pcb->encryption = op->encryption; error = ng_btsocket_l2cap_send_l2ca_cfg_req(pcb); if (error != 0) { /* Send disconnect request with "zero" token */ ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); /* ... and close the socket */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } else { pcb->cfg_state = NG_BTSOCKET_L2CAP_CFG_IN_SENT; pcb->state = NG_BTSOCKET_L2CAP_CONFIGURING; ng_btsocket_l2cap_timeout(pcb); } } } else { /* * We have failed to open connection, so convert result * code to "errno" code and disconnect the socket. Channel * already has been closed. */ pcb->so->so_error = ng_btsocket_l2cap_result2errno(op->result); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); } /* ng_btsocket_l2cap_process_l2ca_con_req_rsp */ /* * Process L2CA_ConnectRsp response */ static int ng_btsocket_l2cap_process_l2ca_con_rsp_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_con_rsp_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_con_rsp_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Look for the socket with the token */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_ConnectRsp response, token=%d, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, state=%d\n", __func__, msg->header.token, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, op->result, pcb->state); if (pcb->state != NG_BTSOCKET_L2CAP_CONNECTING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } ng_btsocket_l2cap_untimeout(pcb); /* Check the result and disconnect the socket on failure */ if (op->result != NG_L2CAP_SUCCESS) { /* Close the socket - channel already closed */ pcb->so->so_error = ng_btsocket_l2cap_result2errno(op->result); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } else { /* Move to CONFIGURING state and wait for CONFIG_IND */ pcb->cfg_state = 0; pcb->state = NG_BTSOCKET_L2CAP_CONFIGURING; ng_btsocket_l2cap_timeout(pcb); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_process_l2ca_con_rsp_rsp */ /* * Process L2CA_Connect indicator. Find socket that listens on address * and PSM. Find exact or closest match. Create new socket and initiate * connection. */ static int ng_btsocket_l2cap_process_l2ca_con_ind(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_con_ind_ip *ip = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL, *pcb1 = NULL; int error = 0; u_int32_t token = 0; u_int16_t result = 0; if (msg->header.arglen != sizeof(*ip)) return (EMSGSIZE); ip = (ng_l2cap_l2ca_con_ind_ip *)(msg->data); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Connect indicator, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, ident=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], ip->bdaddr.b[5], ip->bdaddr.b[4], ip->bdaddr.b[3], ip->bdaddr.b[2], ip->bdaddr.b[1], ip->bdaddr.b[0], ip->psm, ip->lcid, ip->ident); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); pcb = ng_btsocket_l2cap_pcb_by_addr(&rt->src, ip->psm); if (pcb != NULL) { - struct socket *so1 = NULL; + struct socket *so1; mtx_lock(&pcb->pcb_mtx); - /* - * First check the pending connections queue and if we have - * space then create new socket and set proper source address. - */ - - if (pcb->so->so_qlen <= pcb->so->so_qlimit) { - CURVNET_SET(pcb->so->so_vnet); - so1 = sonewconn(pcb->so, 0); - CURVNET_RESTORE(); - } - + CURVNET_SET(pcb->so->so_vnet); + so1 = sonewconn(pcb->so, 0); + CURVNET_RESTORE(); if (so1 == NULL) { result = NG_L2CAP_NO_RESOURCES; goto respond; } /* * If we got here than we have created new socket. So complete * connection. If we we listening on specific address then copy * source address from listening socket, otherwise copy source * address from hook's routing information. */ pcb1 = so2l2cap_pcb(so1); KASSERT((pcb1 != NULL), ("%s: pcb1 == NULL\n", __func__)); mtx_lock(&pcb1->pcb_mtx); if (bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(pcb->src)) != 0) bcopy(&pcb->src, &pcb1->src, sizeof(pcb1->src)); else bcopy(&rt->src, &pcb1->src, sizeof(pcb1->src)); pcb1->flags &= ~NG_BTSOCKET_L2CAP_CLIENT; bcopy(&ip->bdaddr, &pcb1->dst, sizeof(pcb1->dst)); pcb1->psm = ip->psm; pcb1->cid = ip->lcid; pcb1->rt = rt; /* Copy socket settings */ pcb1->imtu = pcb->imtu; bcopy(&pcb->oflow, &pcb1->oflow, sizeof(pcb1->oflow)); pcb1->flush_timo = pcb->flush_timo; token = pcb1->token; } else /* Nobody listens on requested BDADDR/PSM */ result = NG_L2CAP_PSM_NOT_SUPPORTED; respond: error = ng_btsocket_l2cap_send_l2ca_con_rsp_req(token, rt, &ip->bdaddr, ip->ident, ip->lcid, result,ip->linktype); if (pcb1 != NULL) { if (error != 0) { pcb1->so->so_error = error; pcb1->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb1->so); } else { pcb1->state = NG_BTSOCKET_L2CAP_CONNECTING; soisconnecting(pcb1->so); ng_btsocket_l2cap_timeout(pcb1); } mtx_unlock(&pcb1->pcb_mtx); } if (pcb != NULL) mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); } /* ng_btsocket_l2cap_process_l2ca_con_ind */ /*Encryption Change*/ static int ng_btsocket_l2cap_process_l2ca_enc_change(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_enc_chg_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_enc_chg_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); pcb = ng_btsocket_l2cap_pcb_by_cid(&rt->src, op->lcid, op->idtype); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); pcb->encryption = op->result; if(pcb->need_encrypt){ ng_btsocket_l2cap_untimeout(pcb); if(pcb->state != NG_BTSOCKET_L2CAP_W4_ENC_CHANGE){ NG_BTSOCKET_L2CAP_WARN("%s: Invalid pcb status %d", __func__, pcb->state); }else if(pcb->encryption){ pcb->state = NG_BTSOCKET_L2CAP_OPEN; soisconnected(pcb->so); }else{ pcb->so->so_error = EPERM; ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return 0; } /* * Process L2CA_Config response */ static int ng_btsocket_l2cap_process_l2ca_cfg_req_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_cfg_op *op = NULL; ng_btsocket_l2cap_pcb_p pcb = NULL; if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_cfg_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* * Socket must have issued a Configure request, so we must have a * socket that wants to be configured. Use Netgraph message token * to find it */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { /* * XXX FIXME what to do here? We could not find a * socket with requested token. We even can not send * Disconnect, because we do not know channel ID */ mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Config response, token=%d, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, state=%d, " \ "cfg_state=%x\n", __func__, msg->header.token, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, op->result, pcb->state, pcb->cfg_state); if (pcb->state != NG_BTSOCKET_L2CAP_CONFIGURING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } if (op->result == NG_L2CAP_SUCCESS) { /* * XXX FIXME Actually set flush and link timeout. * Set QoS here if required. Resolve conficts (flush_timo). * Save incoming MTU (peer's outgoing MTU) and outgoing flow * spec. */ pcb->imtu = op->imtu; bcopy(&op->oflow, &pcb->oflow, sizeof(pcb->oflow)); pcb->flush_timo = op->flush_timo; /* * We have configured incoming side, so record it and check * if configuration is complete. If complete then mark socket * as connected, otherwise wait for the peer. */ pcb->cfg_state &= ~NG_BTSOCKET_L2CAP_CFG_IN_SENT; pcb->cfg_state |= NG_BTSOCKET_L2CAP_CFG_IN; if (pcb->cfg_state == NG_BTSOCKET_L2CAP_CFG_BOTH) { /* Configuration complete - mark socket as open */ ng_btsocket_l2cap_untimeout(pcb); pcb->state = NG_BTSOCKET_L2CAP_OPEN; soisconnected(pcb->so); } } else { /* * Something went wrong. Could be unacceptable parameters, * reject or unknown option. That's too bad, but we will * not negotiate. Send Disconnect and close the channel. */ ng_btsocket_l2cap_untimeout(pcb); switch (op->result) { case NG_L2CAP_UNACCEPTABLE_PARAMS: case NG_L2CAP_UNKNOWN_OPTION: pcb->so->so_error = EINVAL; break; default: pcb->so->so_error = ECONNRESET; break; } /* Send disconnect with "zero" token */ ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); /* ... and close the socket */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_l2cap_process_l2ca_cfg_req_rsp */ /* * Process L2CA_ConfigRsp response */ static int ng_btsocket_l2cap_process_l2ca_cfg_rsp_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_cfg_rsp_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; int error = 0; if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_cfg_rsp_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Look for the socket with the token */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_ConfigRsp response, token=%d, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, state=%d, " \ "cfg_state=%x\n", __func__, msg->header.token, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, op->result, pcb->state, pcb->cfg_state); if (pcb->state != NG_BTSOCKET_L2CAP_CONFIGURING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } /* Check the result and disconnect socket of failure */ if (op->result != NG_L2CAP_SUCCESS) goto disconnect; /* * Now we done with remote side configuration. Configure local * side if we have not done it yet. */ pcb->cfg_state &= ~NG_BTSOCKET_L2CAP_CFG_OUT_SENT; pcb->cfg_state |= NG_BTSOCKET_L2CAP_CFG_OUT; if (pcb->cfg_state == NG_BTSOCKET_L2CAP_CFG_BOTH) { /* Configuration complete - mask socket as open */ ng_btsocket_l2cap_untimeout(pcb); pcb->state = NG_BTSOCKET_L2CAP_OPEN; soisconnected(pcb->so); } else { if (!(pcb->cfg_state & NG_BTSOCKET_L2CAP_CFG_IN_SENT)) { /* Send L2CA_Config request - incoming path */ error = ng_btsocket_l2cap_send_l2ca_cfg_req(pcb); if (error != 0) goto disconnect; pcb->cfg_state |= NG_BTSOCKET_L2CAP_CFG_IN_SENT; } } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); disconnect: ng_btsocket_l2cap_untimeout(pcb); /* Send disconnect with "zero" token */ ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); /* ... and close the socket */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); } /* ng_btsocket_l2cap_process_l2ca_cfg_rsp_rsp */ /* * Process L2CA_Config indicator */ static int ng_btsocket_l2cap_process_l2ca_cfg_ind(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_cfg_ind_ip *ip = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; int error = 0; if (msg->header.arglen != sizeof(*ip)) return (EMSGSIZE); ip = (ng_l2cap_l2ca_cfg_ind_ip *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Check for the open socket that has given channel ID */ pcb = ng_btsocket_l2cap_pcb_by_cid(&rt->src, ip->lcid, NG_L2CAP_L2CA_IDTYPE_BREDR); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Config indicator, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, state=%d, cfg_state=%x\n", __func__, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, pcb->state, pcb->cfg_state); /* XXX FIXME re-configuration on open socket */ if (pcb->state != NG_BTSOCKET_L2CAP_CONFIGURING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } /* * XXX FIXME Actually set flush and link timeout. Set QoS here if * required. Resolve conficts (flush_timo). Note outgoing MTU (peer's * incoming MTU) and incoming flow spec. */ pcb->omtu = ip->omtu; bcopy(&ip->iflow, &pcb->iflow, sizeof(pcb->iflow)); pcb->flush_timo = ip->flush_timo; /* * Send L2CA_Config response to our peer and check for the errors, * if any send disconnect to close the channel. */ if (!(pcb->cfg_state & NG_BTSOCKET_L2CAP_CFG_OUT_SENT)) { error = ng_btsocket_l2cap_send_l2ca_cfg_rsp(pcb); if (error != 0) { ng_btsocket_l2cap_untimeout(pcb); pcb->so->so_error = error; /* Send disconnect with "zero" token */ ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); /* ... and close the socket */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } else pcb->cfg_state |= NG_BTSOCKET_L2CAP_CFG_OUT_SENT; } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); } /* ng_btsocket_l2cap_process_l2cap_cfg_ind */ /* * Process L2CA_Disconnect response */ static int ng_btsocket_l2cap_process_l2ca_discon_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_discon_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; /* Check message */ if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_discon_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* * Socket layer must have issued L2CA_Disconnect request, so there * must be a socket that wants to be disconnected. Use Netgraph * message token to find it. */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } mtx_lock(&pcb->pcb_mtx); /* XXX Close socket no matter what op->result says */ if (pcb->state != NG_BTSOCKET_L2CAP_CLOSED) { NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Disconnect response, token=%d, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, state=%d\n", __func__, msg->header.token, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, op->result, pcb->state); ng_btsocket_l2cap_untimeout(pcb); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_l2cap_process_l2ca_discon_rsp */ /* * Process L2CA_Disconnect indicator */ static int ng_btsocket_l2cap_process_l2ca_discon_ind(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_discon_ind_ip *ip = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; /* Check message */ if (msg->header.arglen != sizeof(*ip)) return (EMSGSIZE); ip = (ng_l2cap_l2ca_discon_ind_ip *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Look for the socket with given channel ID */ pcb = ng_btsocket_l2cap_pcb_by_cid(&rt->src, ip->lcid, ip->idtype); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* * Channel has already been destroyed, so disconnect the socket * and be done with it. If there was any pending request we can * not do anything here anyway. */ mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Disconnect indicator, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, state=%d\n", __func__, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, pcb->state); if (pcb->flags & NG_BTSOCKET_L2CAP_TIMO) ng_btsocket_l2cap_untimeout(pcb); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_l2cap_process_l2ca_discon_ind */ /* * Process L2CA_Write response */ static int ng_btsocket_l2cap_process_l2ca_write_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_write_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; /* Check message */ if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_write_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Look for the socket with given token */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Write response, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, length=%d, " \ "state=%d\n", __func__, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, op->result, op->length, pcb->state); if (pcb->state != NG_BTSOCKET_L2CAP_OPEN) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } ng_btsocket_l2cap_untimeout(pcb); /* * Check if we have more data to send */ sbdroprecord(&pcb->so->so_snd); if (sbavail(&pcb->so->so_snd) > 0) { if (ng_btsocket_l2cap_send2(pcb) == 0) ng_btsocket_l2cap_timeout(pcb); else sbdroprecord(&pcb->so->so_snd); /* XXX */ } /* * Now set the result, drop packet from the socket send queue and * ask for more (wakeup sender) */ pcb->so->so_error = ng_btsocket_l2cap_result2errno(op->result); sowwakeup(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_l2cap_process_l2ca_write_rsp */ /* * Send L2CA_Connect request */ static int ng_btsocket_l2cap_send_l2ca_con_req(ng_btsocket_l2cap_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_l2cap_l2ca_con_ip *ip = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_L2CAP_COOKIE, NGM_L2CAP_L2CA_CON, sizeof(*ip), M_NOWAIT); if (msg == NULL) return (ENOMEM); msg->header.token = pcb->token; ip = (ng_l2cap_l2ca_con_ip *)(msg->data); bcopy(&pcb->dst, &ip->bdaddr, sizeof(ip->bdaddr)); ip->psm = pcb->psm; ip->linktype = ng_btsock_l2cap_addrtype_to_linktype(pcb->dsttype); ip->idtype = pcb->idtype; NG_SEND_MSG_HOOK(error, ng_btsocket_l2cap_node, msg,pcb->rt->hook, 0); return (error); } /* ng_btsocket_l2cap_send_l2ca_con_req */ /* * Send L2CA_Connect response */ static int ng_btsocket_l2cap_send_l2ca_con_rsp_req(u_int32_t token, ng_btsocket_l2cap_rtentry_p rt, bdaddr_p dst, int ident, int lcid, int result, int linktype) { struct ng_mesg *msg = NULL; ng_l2cap_l2ca_con_rsp_ip *ip = NULL; int error = 0; if (rt == NULL || rt->hook == NULL || NG_HOOK_NOT_VALID(rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_L2CAP_COOKIE, NGM_L2CAP_L2CA_CON_RSP, sizeof(*ip), M_NOWAIT); if (msg == NULL) return (ENOMEM); msg->header.token = token; ip = (ng_l2cap_l2ca_con_rsp_ip *)(msg->data); bcopy(dst, &ip->bdaddr, sizeof(ip->bdaddr)); ip->ident = ident; ip->lcid = lcid; ip->linktype = linktype; ip->result = result; ip->status = 0; NG_SEND_MSG_HOOK(error, ng_btsocket_l2cap_node, msg, rt->hook, 0); return (error); } /* ng_btsocket_l2cap_send_l2ca_con_rsp_req */ /* * Send L2CA_Config request */ static int ng_btsocket_l2cap_send_l2ca_cfg_req(ng_btsocket_l2cap_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_l2cap_l2ca_cfg_ip *ip = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_L2CAP_COOKIE, NGM_L2CAP_L2CA_CFG, sizeof(*ip), M_NOWAIT); if (msg == NULL) return (ENOMEM); msg->header.token = pcb->token; ip = (ng_l2cap_l2ca_cfg_ip *)(msg->data); ip->lcid = pcb->cid; ip->imtu = pcb->imtu; bcopy(&pcb->oflow, &ip->oflow, sizeof(ip->oflow)); ip->flush_timo = pcb->flush_timo; ip->link_timo = pcb->link_timo; NG_SEND_MSG_HOOK(error, ng_btsocket_l2cap_node, msg,pcb->rt->hook, 0); return (error); } /* ng_btsocket_l2cap_send_l2ca_cfg_req */ /* * Send L2CA_Config response */ static int ng_btsocket_l2cap_send_l2ca_cfg_rsp(ng_btsocket_l2cap_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_l2cap_l2ca_cfg_rsp_ip *ip = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_L2CAP_COOKIE, NGM_L2CAP_L2CA_CFG_RSP, sizeof(*ip), M_NOWAIT); if (msg == NULL) return (ENOMEM); msg->header.token = pcb->token; ip = (ng_l2cap_l2ca_cfg_rsp_ip *)(msg->data); ip->lcid = pcb->cid; ip->omtu = pcb->omtu; bcopy(&pcb->iflow, &ip->iflow, sizeof(ip->iflow)); NG_SEND_MSG_HOOK(error, ng_btsocket_l2cap_node, msg, pcb->rt->hook, 0); return (error); } /* ng_btsocket_l2cap_send_l2ca_cfg_rsp */ /* * Send L2CA_Disconnect request */ static int ng_btsocket_l2cap_send_l2ca_discon_req(u_int32_t token, ng_btsocket_l2cap_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_l2cap_l2ca_discon_ip *ip = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_L2CAP_COOKIE, NGM_L2CAP_L2CA_DISCON, sizeof(*ip), M_NOWAIT); if (msg == NULL) return (ENOMEM); msg->header.token = token; ip = (ng_l2cap_l2ca_discon_ip *)(msg->data); ip->lcid = pcb->cid; ip->idtype = pcb->idtype; NG_SEND_MSG_HOOK(error, ng_btsocket_l2cap_node, msg,pcb->rt->hook, 0); return (error); } /* ng_btsocket_l2cap_send_l2ca_discon_req */ /***************************************************************************** ***************************************************************************** ** Socket interface ***************************************************************************** *****************************************************************************/ /* * L2CAP sockets data input routine */ static void ng_btsocket_l2cap_data_input(struct mbuf *m, hook_p hook) { ng_l2cap_hdr_t *hdr = NULL; ng_l2cap_clt_hdr_t *clt_hdr = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; ng_btsocket_l2cap_rtentry_t *rt = NULL; uint16_t idtype; if (hook == NULL) { NG_BTSOCKET_L2CAP_ALERT( "%s: Invalid source hook for L2CAP data packet\n", __func__); goto drop; } rt = (ng_btsocket_l2cap_rtentry_t *) NG_HOOK_PRIVATE(hook); if (rt == NULL) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not find out source bdaddr for L2CAP data packet\n", __func__); goto drop; } m = m_pullup(m, sizeof(uint16_t)); idtype = *mtod(m, uint16_t *); m_adj(m, sizeof(uint16_t)); /* Make sure we can access header */ if (m->m_pkthdr.len < sizeof(*hdr)) { NG_BTSOCKET_L2CAP_ERR( "%s: L2CAP data packet too small, len=%d\n", __func__, m->m_pkthdr.len); goto drop; } if (m->m_len < sizeof(*hdr)) { m = m_pullup(m, sizeof(*hdr)); if (m == NULL) goto drop; } /* Strip L2CAP packet header and verify packet length */ hdr = mtod(m, ng_l2cap_hdr_t *); m_adj(m, sizeof(*hdr)); if (hdr->length != m->m_pkthdr.len) { NG_BTSOCKET_L2CAP_ERR( "%s: Bad L2CAP data packet length, len=%d, length=%d\n", __func__, m->m_pkthdr.len, hdr->length); goto drop; } /* * Now process packet. Two cases: * * 1) Normal packet (cid != 2) then find connected socket and append * mbuf to the socket queue. Wakeup socket. * * 2) Broadcast packet (cid == 2) then find all sockets that connected * to the given PSM and have SO_BROADCAST bit set and append mbuf * to the socket queue. Wakeup socket. */ NG_BTSOCKET_L2CAP_INFO( "%s: Received L2CAP data packet: src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dcid=%d, length=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->dcid, hdr->length); if ((hdr->dcid >= NG_L2CAP_FIRST_CID) || (idtype == NG_L2CAP_L2CA_IDTYPE_ATT)|| (idtype == NG_L2CAP_L2CA_IDTYPE_SMP) ){ mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Normal packet: find connected socket */ pcb = ng_btsocket_l2cap_pcb_by_cid(&rt->src, hdr->dcid,idtype); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); goto drop; } mtx_lock(&pcb->pcb_mtx); if (pcb->state != NG_BTSOCKET_L2CAP_OPEN) { NG_BTSOCKET_L2CAP_ERR( "%s: No connected socket found, src bdaddr=%x:%x:%x:%x:%x:%x, dcid=%d, " \ "state=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->dcid, pcb->state); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); goto drop; } /* Check packet size against socket's incoming MTU */ if (hdr->length > pcb->imtu) { NG_BTSOCKET_L2CAP_ERR( "%s: L2CAP data packet too big, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dcid=%d, length=%d, imtu=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->dcid, hdr->length, pcb->imtu); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); goto drop; } /* Check if we have enough space in socket receive queue */ if (m->m_pkthdr.len > sbspace(&pcb->so->so_rcv)) { /* * This is really bad. Receive queue on socket does * not have enough space for the packet. We do not * have any other choice but drop the packet. L2CAP * does not provide any flow control. */ NG_BTSOCKET_L2CAP_ERR( "%s: Not enough space in socket receive queue. Dropping L2CAP data packet, " \ "src bdaddr=%x:%x:%x:%x:%x:%x, dcid=%d, len=%d, space=%ld\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->dcid, m->m_pkthdr.len, sbspace(&pcb->so->so_rcv)); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); goto drop; } /* Append packet to the socket receive queue and wakeup */ sbappendrecord(&pcb->so->so_rcv, m); m = NULL; sorwakeup(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); } else if (hdr->dcid == NG_L2CAP_CLT_CID) { /* Broadcast packet: give packet to all sockets */ /* Check packet size against connectionless MTU */ if (hdr->length > NG_L2CAP_MTU_DEFAULT) { NG_BTSOCKET_L2CAP_ERR( "%s: Connectionless L2CAP data packet too big, " \ "src bdaddr=%x:%x:%x:%x:%x:%x, length=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->length); goto drop; } /* Make sure we can access connectionless header */ if (m->m_pkthdr.len < sizeof(*clt_hdr)) { NG_BTSOCKET_L2CAP_ERR( "%s: Can not get L2CAP connectionless packet header, " \ "src bdaddr=%x:%x:%x:%x:%x:%x, length=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->length); goto drop; } if (m->m_len < sizeof(*clt_hdr)) { m = m_pullup(m, sizeof(*clt_hdr)); if (m == NULL) goto drop; } /* Strip connectionless header and deliver packet */ clt_hdr = mtod(m, ng_l2cap_clt_hdr_t *); m_adj(m, sizeof(*clt_hdr)); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CAP connectionless data packet, " \ "src bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, length=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], clt_hdr->psm, hdr->length); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); LIST_FOREACH(pcb, &ng_btsocket_l2cap_sockets, next) { struct mbuf *copy = NULL; mtx_lock(&pcb->pcb_mtx); if (bcmp(&rt->src, &pcb->src, sizeof(pcb->src)) != 0 || pcb->psm != clt_hdr->psm || pcb->state != NG_BTSOCKET_L2CAP_OPEN || (pcb->so->so_options & SO_BROADCAST) == 0 || m->m_pkthdr.len > sbspace(&pcb->so->so_rcv)) goto next; /* * Create a copy of the packet and append it to the * socket's queue. If m_dup() failed - no big deal * it is a broadcast traffic after all */ copy = m_dup(m, M_NOWAIT); if (copy != NULL) { sbappendrecord(&pcb->so->so_rcv, copy); sorwakeup(pcb->so); } next: mtx_unlock(&pcb->pcb_mtx); } mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); } drop: NG_FREE_M(m); /* checks for m != NULL */ } /* ng_btsocket_l2cap_data_input */ /* * L2CAP sockets default message input routine */ static void ng_btsocket_l2cap_default_msg_input(struct ng_mesg *msg, hook_p hook) { switch (msg->header.cmd) { case NGM_L2CAP_NODE_HOOK_INFO: { ng_btsocket_l2cap_rtentry_t *rt = NULL; ng_l2cap_node_hook_info_ep *ep = (ng_l2cap_node_hook_info_ep *)msg->data; if (hook == NULL || msg->header.arglen != sizeof(*ep)) break; if (bcmp(&ep->addr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) break; mtx_lock(&ng_btsocket_l2cap_rt_mtx); rt = (ng_btsocket_l2cap_rtentry_t *) NG_HOOK_PRIVATE(hook); if (rt == NULL) { rt = malloc(sizeof(*rt), M_NETGRAPH_BTSOCKET_L2CAP, M_NOWAIT|M_ZERO); if (rt == NULL) { mtx_unlock(&ng_btsocket_l2cap_rt_mtx); break; } LIST_INSERT_HEAD(&ng_btsocket_l2cap_rt, rt, next); NG_HOOK_SET_PRIVATE(hook, rt); } bcopy(&ep->addr, &rt->src, sizeof(rt->src)); rt->hook = hook; mtx_unlock(&ng_btsocket_l2cap_rt_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Updating hook \"%s\", src bdaddr=%x:%x:%x:%x:%x:%x\n", __func__, NG_HOOK_NAME(hook), rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0]); } break; default: NG_BTSOCKET_L2CAP_WARN( "%s: Unknown message, cmd=%d\n", __func__, msg->header.cmd); break; } NG_FREE_MSG(msg); /* Checks for msg != NULL */ } /* ng_btsocket_l2cap_default_msg_input */ /* * L2CAP sockets L2CA message input routine */ static void ng_btsocket_l2cap_l2ca_msg_input(struct ng_mesg *msg, hook_p hook) { ng_btsocket_l2cap_rtentry_p rt = NULL; if (hook == NULL) { NG_BTSOCKET_L2CAP_ALERT( "%s: Invalid source hook for L2CA message\n", __func__); goto drop; } rt = (ng_btsocket_l2cap_rtentry_p) NG_HOOK_PRIVATE(hook); if (rt == NULL) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not find out source bdaddr for L2CA message\n", __func__); goto drop; } switch (msg->header.cmd) { case NGM_L2CAP_L2CA_CON: /* L2CA_Connect response */ ng_btsocket_l2cap_process_l2ca_con_req_rsp(msg, rt); break; case NGM_L2CAP_L2CA_CON_RSP: /* L2CA_ConnectRsp response */ ng_btsocket_l2cap_process_l2ca_con_rsp_rsp(msg, rt); break; case NGM_L2CAP_L2CA_CON_IND: /* L2CA_Connect indicator */ ng_btsocket_l2cap_process_l2ca_con_ind(msg, rt); break; case NGM_L2CAP_L2CA_CFG: /* L2CA_Config response */ ng_btsocket_l2cap_process_l2ca_cfg_req_rsp(msg, rt); break; case NGM_L2CAP_L2CA_CFG_RSP: /* L2CA_ConfigRsp response */ ng_btsocket_l2cap_process_l2ca_cfg_rsp_rsp(msg, rt); break; case NGM_L2CAP_L2CA_CFG_IND: /* L2CA_Config indicator */ ng_btsocket_l2cap_process_l2ca_cfg_ind(msg, rt); break; case NGM_L2CAP_L2CA_DISCON: /* L2CA_Disconnect response */ ng_btsocket_l2cap_process_l2ca_discon_rsp(msg, rt); break; case NGM_L2CAP_L2CA_DISCON_IND: /* L2CA_Disconnect indicator */ ng_btsocket_l2cap_process_l2ca_discon_ind(msg, rt); break; case NGM_L2CAP_L2CA_WRITE: /* L2CA_Write response */ ng_btsocket_l2cap_process_l2ca_write_rsp(msg, rt); break; case NGM_L2CAP_L2CA_ENC_CHANGE: ng_btsocket_l2cap_process_l2ca_enc_change(msg, rt); break; /* XXX FIXME add other L2CA messages */ default: NG_BTSOCKET_L2CAP_WARN( "%s: Unknown L2CA message, cmd=%d\n", __func__, msg->header.cmd); break; } drop: NG_FREE_MSG(msg); } /* ng_btsocket_l2cap_l2ca_msg_input */ /* * L2CAP sockets input routine */ static void ng_btsocket_l2cap_input(void *context, int pending) { item_p item = NULL; hook_p hook = NULL; for (;;) { mtx_lock(&ng_btsocket_l2cap_queue_mtx); NG_BT_ITEMQ_DEQUEUE(&ng_btsocket_l2cap_queue, item); mtx_unlock(&ng_btsocket_l2cap_queue_mtx); if (item == NULL) break; NGI_GET_HOOK(item, hook); if (hook != NULL && NG_HOOK_NOT_VALID(hook)) goto drop; switch(item->el_flags & NGQF_TYPE) { case NGQF_DATA: { struct mbuf *m = NULL; NGI_GET_M(item, m); ng_btsocket_l2cap_data_input(m, hook); } break; case NGQF_MESG: { struct ng_mesg *msg = NULL; NGI_GET_MSG(item, msg); switch (msg->header.cmd) { case NGM_L2CAP_L2CA_CON: case NGM_L2CAP_L2CA_CON_RSP: case NGM_L2CAP_L2CA_CON_IND: case NGM_L2CAP_L2CA_CFG: case NGM_L2CAP_L2CA_CFG_RSP: case NGM_L2CAP_L2CA_CFG_IND: case NGM_L2CAP_L2CA_DISCON: case NGM_L2CAP_L2CA_DISCON_IND: case NGM_L2CAP_L2CA_WRITE: case NGM_L2CAP_L2CA_ENC_CHANGE: /* XXX FIXME add other L2CA messages */ ng_btsocket_l2cap_l2ca_msg_input(msg, hook); break; default: ng_btsocket_l2cap_default_msg_input(msg, hook); break; } } break; default: KASSERT(0, ("%s: invalid item type=%ld\n", __func__, (item->el_flags & NGQF_TYPE))); break; } drop: if (hook != NULL) NG_HOOK_UNREF(hook); NG_FREE_ITEM(item); } } /* ng_btsocket_l2cap_input */ /* * Route cleanup task. Gets scheduled when hook is disconnected. Here we * will find all sockets that use "invalid" hook and disconnect them. */ static void ng_btsocket_l2cap_rtclean(void *context, int pending) { ng_btsocket_l2cap_pcb_p pcb = NULL, pcb_next = NULL; ng_btsocket_l2cap_rtentry_p rt = NULL; mtx_lock(&ng_btsocket_l2cap_rt_mtx); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* * First disconnect all sockets that use "invalid" hook */ for (pcb = LIST_FIRST(&ng_btsocket_l2cap_sockets); pcb != NULL; ) { mtx_lock(&pcb->pcb_mtx); pcb_next = LIST_NEXT(pcb, next); if (pcb->rt != NULL && pcb->rt->hook != NULL && NG_HOOK_NOT_VALID(pcb->rt->hook)) { if (pcb->flags & NG_BTSOCKET_L2CAP_TIMO) ng_btsocket_l2cap_untimeout(pcb); pcb->so->so_error = ENETDOWN; pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); pcb->token = 0; pcb->cid = 0; pcb->rt = NULL; } mtx_unlock(&pcb->pcb_mtx); pcb = pcb_next; } /* * Now cleanup routing table */ for (rt = LIST_FIRST(&ng_btsocket_l2cap_rt); rt != NULL; ) { ng_btsocket_l2cap_rtentry_p rt_next = LIST_NEXT(rt, next); if (rt->hook != NULL && NG_HOOK_NOT_VALID(rt->hook)) { LIST_REMOVE(rt, next); NG_HOOK_SET_PRIVATE(rt->hook, NULL); NG_HOOK_UNREF(rt->hook); /* Remove extra reference */ bzero(rt, sizeof(*rt)); free(rt, M_NETGRAPH_BTSOCKET_L2CAP); } rt = rt_next; } mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); mtx_unlock(&ng_btsocket_l2cap_rt_mtx); } /* ng_btsocket_l2cap_rtclean */ /* * Initialize everything */ void ng_btsocket_l2cap_init(void) { int error = 0; /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; ng_btsocket_l2cap_node = NULL; ng_btsocket_l2cap_debug_level = NG_BTSOCKET_WARN_LEVEL; /* Register Netgraph node type */ error = ng_newtype(&typestruct); if (error != 0) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not register Netgraph node type, error=%d\n", __func__, error); return; } /* Create Netgrapg node */ error = ng_make_node_common(&typestruct, &ng_btsocket_l2cap_node); if (error != 0) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not create Netgraph node, error=%d\n", __func__, error); ng_btsocket_l2cap_node = NULL; return; } error = ng_name_node(ng_btsocket_l2cap_node, NG_BTSOCKET_L2CAP_NODE_TYPE); if (error != 0) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not name Netgraph node, error=%d\n", __func__, error); NG_NODE_UNREF(ng_btsocket_l2cap_node); ng_btsocket_l2cap_node = NULL; return; } /* Create input queue */ NG_BT_ITEMQ_INIT(&ng_btsocket_l2cap_queue, ifqmaxlen); mtx_init(&ng_btsocket_l2cap_queue_mtx, "btsocks_l2cap_queue_mtx", NULL, MTX_DEF); TASK_INIT(&ng_btsocket_l2cap_queue_task, 0, ng_btsocket_l2cap_input, NULL); /* Create list of sockets */ LIST_INIT(&ng_btsocket_l2cap_sockets); mtx_init(&ng_btsocket_l2cap_sockets_mtx, "btsocks_l2cap_sockets_mtx", NULL, MTX_DEF); /* Routing table */ LIST_INIT(&ng_btsocket_l2cap_rt); mtx_init(&ng_btsocket_l2cap_rt_mtx, "btsocks_l2cap_rt_mtx", NULL, MTX_DEF); TASK_INIT(&ng_btsocket_l2cap_rt_task, 0, ng_btsocket_l2cap_rtclean, NULL); } /* ng_btsocket_l2cap_init */ /* * Abort connection on socket */ void ng_btsocket_l2cap_abort(struct socket *so) { so->so_error = ECONNABORTED; (void)ng_btsocket_l2cap_disconnect(so); } /* ng_btsocket_l2cap_abort */ void ng_btsocket_l2cap_close(struct socket *so) { (void)ng_btsocket_l2cap_disconnect(so); } /* ng_btsocket_l2cap_close */ /* * Accept connection on socket. Nothing to do here, socket must be connected * and ready, so just return peer address and be done with it. */ int ng_btsocket_l2cap_accept(struct socket *so, struct sockaddr **nam) { if (ng_btsocket_l2cap_node == NULL) return (EINVAL); return (ng_btsocket_l2cap_peeraddr(so, nam)); } /* ng_btsocket_l2cap_accept */ /* * Create and attach new socket */ int ng_btsocket_l2cap_attach(struct socket *so, int proto, struct thread *td) { static u_int32_t token = 0; ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); int error; /* Check socket and protocol */ if (ng_btsocket_l2cap_node == NULL) return (EPROTONOSUPPORT); if (so->so_type != SOCK_SEQPACKET) return (ESOCKTNOSUPPORT); #if 0 /* XXX sonewconn() calls "pru_attach" with proto == 0 */ if (proto != 0) if (proto != BLUETOOTH_PROTO_L2CAP) return (EPROTONOSUPPORT); #endif /* XXX */ if (pcb != NULL) return (EISCONN); /* Reserve send and receive space if it is not reserved yet */ if ((so->so_snd.sb_hiwat == 0) || (so->so_rcv.sb_hiwat == 0)) { error = soreserve(so, NG_BTSOCKET_L2CAP_SENDSPACE, NG_BTSOCKET_L2CAP_RECVSPACE); if (error != 0) return (error); } /* Allocate the PCB */ pcb = malloc(sizeof(*pcb), M_NETGRAPH_BTSOCKET_L2CAP, M_NOWAIT | M_ZERO); if (pcb == NULL) return (ENOMEM); /* Link the PCB and the socket */ so->so_pcb = (caddr_t) pcb; pcb->so = so; pcb->state = NG_BTSOCKET_L2CAP_CLOSED; /* Initialize PCB */ pcb->imtu = pcb->omtu = NG_L2CAP_MTU_DEFAULT; /* Default flow */ pcb->iflow.flags = 0x0; pcb->iflow.service_type = NG_HCI_SERVICE_TYPE_BEST_EFFORT; pcb->iflow.token_rate = 0xffffffff; /* maximum */ pcb->iflow.token_bucket_size = 0xffffffff; /* maximum */ pcb->iflow.peak_bandwidth = 0x00000000; /* maximum */ pcb->iflow.latency = 0xffffffff; /* don't care */ pcb->iflow.delay_variation = 0xffffffff; /* don't care */ bcopy(&pcb->iflow, &pcb->oflow, sizeof(pcb->oflow)); pcb->flush_timo = NG_L2CAP_FLUSH_TIMO_DEFAULT; pcb->link_timo = NG_L2CAP_LINK_TIMO_DEFAULT; /* * XXX Mark PCB mutex as DUPOK to prevent "duplicated lock of * the same type" message. When accepting new L2CAP connection * ng_btsocket_l2cap_process_l2ca_con_ind() holds both PCB mutexes * for "old" (accepting) PCB and "new" (created) PCB. */ mtx_init(&pcb->pcb_mtx, "btsocks_l2cap_pcb_mtx", NULL, MTX_DEF|MTX_DUPOK); callout_init_mtx(&pcb->timo, &pcb->pcb_mtx, 0); /* * Add the PCB to the list * * XXX FIXME VERY IMPORTANT! * * This is totally FUBAR. We could get here in two cases: * * 1) When user calls socket() * 2) When we need to accept new incoming connection and call * sonewconn() * * In the first case we must acquire ng_btsocket_l2cap_sockets_mtx. * In the second case we hold ng_btsocket_l2cap_sockets_mtx already. * So we now need to distinguish between these cases. From reading * /sys/kern/uipc_socket.c we can find out that sonewconn() calls * pru_attach with proto == 0 and td == NULL. For now use this fact * to figure out if we were called from socket() or from sonewconn(). */ if (td != NULL) mtx_lock(&ng_btsocket_l2cap_sockets_mtx); else mtx_assert(&ng_btsocket_l2cap_sockets_mtx, MA_OWNED); /* Set PCB token. Use ng_btsocket_l2cap_sockets_mtx for protection */ if (++ token == 0) token ++; pcb->token = token; LIST_INSERT_HEAD(&ng_btsocket_l2cap_sockets, pcb, next); if (td != NULL) mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_l2cap_attach */ /* * Bind socket */ int ng_btsocket_l2cap_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_l2cap_pcb_t *pcb = NULL; struct sockaddr_l2cap *sa = (struct sockaddr_l2cap *) nam; int psm, error = 0; if (ng_btsocket_l2cap_node == NULL) return (EINVAL); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->l2cap_family != AF_BLUETOOTH) return (EAFNOSUPPORT); /*For the time being, Not support LE binding.*/ if ((sa->l2cap_len != sizeof(*sa))&& (sa->l2cap_len != sizeof(struct sockaddr_l2cap_compat))) return (EINVAL); psm = le16toh(sa->l2cap_psm); /* * Check if other socket has this address already (look for exact * match PSM and bdaddr) and assign socket address if it's available. * * Note: socket can be bound to ANY PSM (zero) thus allowing several * channels with the same PSM between the same pair of BD_ADDR'es. */ mtx_lock(&ng_btsocket_l2cap_sockets_mtx); LIST_FOREACH(pcb, &ng_btsocket_l2cap_sockets, next) if (psm != 0 && psm == pcb->psm && bcmp(&pcb->src, &sa->l2cap_bdaddr, sizeof(bdaddr_t)) == 0) break; if (pcb == NULL) { /* Set socket address */ pcb = so2l2cap_pcb(so); if (pcb != NULL) { bcopy(&sa->l2cap_bdaddr, &pcb->src, sizeof(pcb->src)); pcb->psm = psm; } else error = EINVAL; } else error = EADDRINUSE; mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); } /* ng_btsocket_l2cap_bind */ /* * Connect socket */ int ng_btsocket_l2cap_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_l2cap_pcb_t *pcb = so2l2cap_pcb(so); struct sockaddr_l2cap_compat *sal = (struct sockaddr_l2cap_compat *) nam; struct sockaddr_l2cap *sa = (struct sockaddr_l2cap *)nam; struct sockaddr_l2cap ba; ng_btsocket_l2cap_rtentry_t *rt = NULL; int have_src, error = 0; int idtype = NG_L2CAP_L2CA_IDTYPE_BREDR; /* Check socket */ if (pcb == NULL) return (EINVAL); if (ng_btsocket_l2cap_node == NULL) return (EINVAL); if (pcb->state == NG_BTSOCKET_L2CAP_CONNECTING) return (EINPROGRESS); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->l2cap_family != AF_BLUETOOTH) return (EAFNOSUPPORT); if (sa->l2cap_len == sizeof(*sal)){ bcopy(sal, &ba, sizeof(*sal)); sa = &ba; sa->l2cap_len = sizeof(*sa); sa->l2cap_bdaddr_type = BDADDR_BREDR; } if (sa->l2cap_len != sizeof(*sa)) return (EINVAL); if ((sa->l2cap_psm && sa->l2cap_cid)) return EINVAL; if (bcmp(&sa->l2cap_bdaddr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) return (EDESTADDRREQ); if((sa->l2cap_bdaddr_type == BDADDR_BREDR)&& (sa->l2cap_psm == 0)) return EDESTADDRREQ; if(sa->l2cap_bdaddr_type != BDADDR_BREDR){ if(sa->l2cap_cid == NG_L2CAP_ATT_CID){ idtype = NG_L2CAP_L2CA_IDTYPE_ATT; }else if (sa->l2cap_cid == NG_L2CAP_SMP_CID){ idtype =NG_L2CAP_L2CA_IDTYPE_SMP; }else{ //if cid == 0 idtype = NG_L2CAP_L2CA_IDTYPE_LE; // Not supported yet return EINVAL; } } if (pcb->psm != 0 && pcb->psm != le16toh(sa->l2cap_psm)) return (EINVAL); /* * Routing. Socket should be bound to some source address. The source * address can be ANY. Destination address must be set and it must not * be ANY. If source address is ANY then find first rtentry that has * src != dst. */ mtx_lock(&ng_btsocket_l2cap_rt_mtx); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); mtx_lock(&pcb->pcb_mtx); /* Send destination address and PSM */ bcopy(&sa->l2cap_bdaddr, &pcb->dst, sizeof(pcb->dst)); pcb->psm = le16toh(sa->l2cap_psm); pcb->dsttype = sa->l2cap_bdaddr_type; pcb->cid = 0; pcb->idtype = idtype; pcb->rt = NULL; have_src = bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(pcb->src)); LIST_FOREACH(rt, &ng_btsocket_l2cap_rt, next) { if (rt->hook == NULL || NG_HOOK_NOT_VALID(rt->hook)) continue; /* Match src and dst */ if (have_src) { if (bcmp(&pcb->src, &rt->src, sizeof(rt->src)) == 0) break; } else { if (bcmp(&pcb->dst, &rt->src, sizeof(rt->src)) != 0) break; } } if (rt != NULL) { pcb->rt = rt; if (!have_src){ bcopy(&rt->src, &pcb->src, sizeof(pcb->src)); pcb->srctype = (sa->l2cap_bdaddr_type == BDADDR_BREDR)? BDADDR_BREDR : BDADDR_LE_PUBLIC; } } else error = EHOSTUNREACH; /* * Send L2CA_Connect request */ if (error == 0) { error = ng_btsocket_l2cap_send_l2ca_con_req(pcb); if (error == 0) { pcb->flags |= NG_BTSOCKET_L2CAP_CLIENT; pcb->state = NG_BTSOCKET_L2CAP_CONNECTING; soisconnecting(pcb->so); ng_btsocket_l2cap_timeout(pcb); } } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); mtx_unlock(&ng_btsocket_l2cap_rt_mtx); return (error); } /* ng_btsocket_l2cap_connect */ /* * Process ioctl's calls on socket */ int ng_btsocket_l2cap_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return (EINVAL); } /* ng_btsocket_l2cap_control */ /* * Process getsockopt/setsockopt system calls */ int ng_btsocket_l2cap_ctloutput(struct socket *so, struct sockopt *sopt) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); int error = 0; ng_l2cap_cfg_opt_val_t v; if (pcb == NULL) return (EINVAL); if (ng_btsocket_l2cap_node == NULL) return (EINVAL); if (sopt->sopt_level != SOL_L2CAP) return (0); mtx_lock(&pcb->pcb_mtx); switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case SO_L2CAP_IMTU: /* get incoming MTU */ error = sooptcopyout(sopt, &pcb->imtu, sizeof(pcb->imtu)); break; case SO_L2CAP_OMTU: /* get outgoing (peer incoming) MTU */ error = sooptcopyout(sopt, &pcb->omtu, sizeof(pcb->omtu)); break; case SO_L2CAP_IFLOW: /* get incoming flow spec. */ error = sooptcopyout(sopt, &pcb->iflow, sizeof(pcb->iflow)); break; case SO_L2CAP_OFLOW: /* get outgoing flow spec. */ error = sooptcopyout(sopt, &pcb->oflow, sizeof(pcb->oflow)); break; case SO_L2CAP_FLUSH: /* get flush timeout */ error = sooptcopyout(sopt, &pcb->flush_timo, sizeof(pcb->flush_timo)); break; case SO_L2CAP_ENCRYPTED: /* get encrypt required */ error = sooptcopyout(sopt, &pcb->need_encrypt, sizeof(pcb->need_encrypt)); break; default: error = ENOPROTOOPT; break; } break; case SOPT_SET: /* * XXX * We do not allow to change these parameters while socket is * connected or we are in the process of creating a connection. * May be this should indicate re-configuration of the open * channel? */ if (pcb->state != NG_BTSOCKET_L2CAP_CLOSED) { error = EACCES; break; } switch (sopt->sopt_name) { case SO_L2CAP_IMTU: /* set incoming MTU */ error = sooptcopyin(sopt, &v, sizeof(v), sizeof(v.mtu)); if (error == 0) pcb->imtu = v.mtu; break; case SO_L2CAP_OFLOW: /* set outgoing flow spec. */ error = sooptcopyin(sopt, &v, sizeof(v),sizeof(v.flow)); if (error == 0) bcopy(&v.flow, &pcb->oflow, sizeof(pcb->oflow)); break; case SO_L2CAP_FLUSH: /* set flush timeout */ error = sooptcopyin(sopt, &v, sizeof(v), sizeof(v.flush_timo)); if (error == 0) pcb->flush_timo = v.flush_timo; break; case SO_L2CAP_ENCRYPTED: /*set connect encryption opt*/ if((pcb->state != NG_BTSOCKET_L2CAP_OPEN) && (pcb->state != NG_BTSOCKET_L2CAP_W4_ENC_CHANGE)){ error = sooptcopyin(sopt, &v, sizeof(v), sizeof(v.encryption)); if(error == 0) pcb->need_encrypt = (v.encryption)?1:0; }else{ error = EINVAL; } break; default: error = ENOPROTOOPT; break; } break; default: error = EINVAL; break; } mtx_unlock(&pcb->pcb_mtx); return (error); } /* ng_btsocket_l2cap_ctloutput */ /* * Detach and destroy socket */ void ng_btsocket_l2cap_detach(struct socket *so) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); KASSERT(pcb != NULL, ("ng_btsocket_l2cap_detach: pcb == NULL")); if (ng_btsocket_l2cap_node == NULL) return; mtx_lock(&ng_btsocket_l2cap_sockets_mtx); mtx_lock(&pcb->pcb_mtx); /* XXX what to do with pending request? */ if (pcb->flags & NG_BTSOCKET_L2CAP_TIMO) ng_btsocket_l2cap_untimeout(pcb); if (pcb->state != NG_BTSOCKET_L2CAP_CLOSED && pcb->state != NG_BTSOCKET_L2CAP_DISCONNECTING) /* Send disconnect request with "zero" token */ ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; LIST_REMOVE(pcb, next); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); mtx_destroy(&pcb->pcb_mtx); bzero(pcb, sizeof(*pcb)); free(pcb, M_NETGRAPH_BTSOCKET_L2CAP); soisdisconnected(so); so->so_pcb = NULL; } /* ng_btsocket_l2cap_detach */ /* * Disconnect socket */ int ng_btsocket_l2cap_disconnect(struct socket *so) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); int error = 0; if (pcb == NULL) return (EINVAL); if (ng_btsocket_l2cap_node == NULL) return (EINVAL); mtx_lock(&pcb->pcb_mtx); if (pcb->state == NG_BTSOCKET_L2CAP_DISCONNECTING) { mtx_unlock(&pcb->pcb_mtx); return (EINPROGRESS); } if (pcb->state != NG_BTSOCKET_L2CAP_CLOSED) { /* XXX FIXME what to do with pending request? */ if (pcb->flags & NG_BTSOCKET_L2CAP_TIMO) ng_btsocket_l2cap_untimeout(pcb); error = ng_btsocket_l2cap_send_l2ca_discon_req(pcb->token, pcb); if (error == 0) { pcb->state = NG_BTSOCKET_L2CAP_DISCONNECTING; soisdisconnecting(so); ng_btsocket_l2cap_timeout(pcb); } /* XXX FIXME what to do if error != 0 */ } mtx_unlock(&pcb->pcb_mtx); return (error); } /* ng_btsocket_l2cap_disconnect */ /* * Listen on socket */ int ng_btsocket_l2cap_listen(struct socket *so, int backlog, struct thread *td) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); int error; SOCK_LOCK(so); error = solisten_proto_check(so); if (error != 0) goto out; if (pcb == NULL) { error = EINVAL; goto out; } if (ng_btsocket_l2cap_node == NULL) { error = EINVAL; goto out; } if (pcb->psm == 0) { error = EADDRNOTAVAIL; goto out; } solisten_proto(so, backlog); out: SOCK_UNLOCK(so); return (error); } /* ng_btsocket_listen */ /* * Get peer address */ int ng_btsocket_l2cap_peeraddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); struct sockaddr_l2cap sa; if (pcb == NULL) return (EINVAL); if (ng_btsocket_l2cap_node == NULL) return (EINVAL); bcopy(&pcb->dst, &sa.l2cap_bdaddr, sizeof(sa.l2cap_bdaddr)); sa.l2cap_psm = htole16(pcb->psm); sa.l2cap_len = sizeof(sa); sa.l2cap_family = AF_BLUETOOTH; switch(pcb->idtype){ case NG_L2CAP_L2CA_IDTYPE_ATT: sa.l2cap_cid = NG_L2CAP_ATT_CID; break; case NG_L2CAP_L2CA_IDTYPE_SMP: sa.l2cap_cid = NG_L2CAP_SMP_CID; break; default: sa.l2cap_cid = 0; break; } sa.l2cap_bdaddr_type = pcb->dsttype; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } /* ng_btsocket_l2cap_peeraddr */ /* * Send data to socket */ int ng_btsocket_l2cap_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { ng_btsocket_l2cap_pcb_t *pcb = so2l2cap_pcb(so); int error = 0; if (ng_btsocket_l2cap_node == NULL) { error = ENETDOWN; goto drop; } /* Check socket and input */ if (pcb == NULL || m == NULL || control != NULL) { error = EINVAL; goto drop; } mtx_lock(&pcb->pcb_mtx); /* Make sure socket is connected */ if (pcb->state != NG_BTSOCKET_L2CAP_OPEN) { mtx_unlock(&pcb->pcb_mtx); error = ENOTCONN; goto drop; } /* Check route */ if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) { mtx_unlock(&pcb->pcb_mtx); error = ENETDOWN; goto drop; } /* Check packet size against outgoing (peer's incoming) MTU) */ if (m->m_pkthdr.len > pcb->omtu) { NG_BTSOCKET_L2CAP_ERR( "%s: Packet too big, len=%d, omtu=%d\n", __func__, m->m_pkthdr.len, pcb->omtu); mtx_unlock(&pcb->pcb_mtx); error = EMSGSIZE; goto drop; } /* * First put packet on socket send queue. Then check if we have * pending timeout. If we do not have timeout then we must send * packet and schedule timeout. Otherwise do nothing and wait for * L2CA_WRITE_RSP. */ sbappendrecord(&pcb->so->so_snd, m); m = NULL; if (!(pcb->flags & NG_BTSOCKET_L2CAP_TIMO)) { error = ng_btsocket_l2cap_send2(pcb); if (error == 0) ng_btsocket_l2cap_timeout(pcb); else sbdroprecord(&pcb->so->so_snd); /* XXX */ } mtx_unlock(&pcb->pcb_mtx); drop: NG_FREE_M(m); /* checks for != NULL */ NG_FREE_M(control); return (error); } /* ng_btsocket_l2cap_send */ /* * Send first packet in the socket queue to the L2CAP layer */ static int ng_btsocket_l2cap_send2(ng_btsocket_l2cap_pcb_p pcb) { struct mbuf *m = NULL; ng_l2cap_l2ca_hdr_t *hdr = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (sbavail(&pcb->so->so_snd) == 0) return (EINVAL); /* XXX */ m = m_dup(pcb->so->so_snd.sb_mb, M_NOWAIT); if (m == NULL) return (ENOBUFS); /* Create L2CA packet header */ M_PREPEND(m, sizeof(*hdr), M_NOWAIT); if (m != NULL) if (m->m_len < sizeof(*hdr)) m = m_pullup(m, sizeof(*hdr)); if (m == NULL) { NG_BTSOCKET_L2CAP_ERR( "%s: Failed to create L2CA packet header\n", __func__); return (ENOBUFS); } hdr = mtod(m, ng_l2cap_l2ca_hdr_t *); hdr->token = pcb->token; hdr->length = m->m_pkthdr.len - sizeof(*hdr); hdr->lcid = pcb->cid; hdr->idtype = pcb->idtype; NG_BTSOCKET_L2CAP_INFO( "%s: Sending packet: len=%d, length=%d, lcid=%d, token=%d, state=%d\n", __func__, m->m_pkthdr.len, hdr->length, hdr->lcid, hdr->token, pcb->state); /* * If we got here than we have successfully creates new L2CAP * data packet and now we can send it to the L2CAP layer */ NG_SEND_DATA_ONLY(error, pcb->rt->hook, m); return (error); } /* ng_btsocket_l2cap_send2 */ /* * Get socket address */ int ng_btsocket_l2cap_sockaddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); struct sockaddr_l2cap sa; if (pcb == NULL) return (EINVAL); if (ng_btsocket_l2cap_node == NULL) return (EINVAL); bcopy(&pcb->src, &sa.l2cap_bdaddr, sizeof(sa.l2cap_bdaddr)); sa.l2cap_psm = htole16(pcb->psm); sa.l2cap_len = sizeof(sa); sa.l2cap_family = AF_BLUETOOTH; sa.l2cap_cid = 0; sa.l2cap_bdaddr_type = pcb->srctype; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } /* ng_btsocket_l2cap_sockaddr */ /***************************************************************************** ***************************************************************************** ** Misc. functions ***************************************************************************** *****************************************************************************/ /* * Look for the socket that listens on given PSM and bdaddr. Returns exact or * close match (if any). Caller must hold ng_btsocket_l2cap_sockets_mtx. */ static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_addr(bdaddr_p bdaddr, int psm) { ng_btsocket_l2cap_pcb_p p = NULL, p1 = NULL; mtx_assert(&ng_btsocket_l2cap_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_l2cap_sockets, next) { if (p->so == NULL || !(p->so->so_options & SO_ACCEPTCONN) || p->psm != psm) continue; if (bcmp(&p->src, bdaddr, sizeof(p->src)) == 0) break; if (bcmp(&p->src, NG_HCI_BDADDR_ANY, sizeof(p->src)) == 0) p1 = p; } return ((p != NULL)? p : p1); } /* ng_btsocket_l2cap_pcb_by_addr */ /* * Look for the socket that has given token. * Caller must hold ng_btsocket_l2cap_sockets_mtx. */ static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_token(u_int32_t token) { ng_btsocket_l2cap_pcb_p p = NULL; if (token == 0) return (NULL); mtx_assert(&ng_btsocket_l2cap_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_l2cap_sockets, next) if (p->token == token) break; return (p); } /* ng_btsocket_l2cap_pcb_by_token */ /* * Look for the socket that assigned to given source address and channel ID. * Caller must hold ng_btsocket_l2cap_sockets_mtx */ static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_cid(bdaddr_p src, int cid, int idtype) { ng_btsocket_l2cap_pcb_p p = NULL; mtx_assert(&ng_btsocket_l2cap_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_l2cap_sockets, next){ if (p->cid == cid && bcmp(src, &p->src, sizeof(p->src)) == 0&& p->idtype == idtype) break; } return (p); } /* ng_btsocket_l2cap_pcb_by_cid */ /* * Set timeout on socket */ static void ng_btsocket_l2cap_timeout(ng_btsocket_l2cap_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (!(pcb->flags & NG_BTSOCKET_L2CAP_TIMO)) { pcb->flags |= NG_BTSOCKET_L2CAP_TIMO; callout_reset(&pcb->timo, bluetooth_l2cap_ertx_timeout(), ng_btsocket_l2cap_process_timeout, pcb); } else KASSERT(0, ("%s: Duplicated socket timeout?!\n", __func__)); } /* ng_btsocket_l2cap_timeout */ /* * Unset timeout on socket */ static void ng_btsocket_l2cap_untimeout(ng_btsocket_l2cap_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->flags & NG_BTSOCKET_L2CAP_TIMO) { callout_stop(&pcb->timo); pcb->flags &= ~NG_BTSOCKET_L2CAP_TIMO; } else KASSERT(0, ("%s: No socket timeout?!\n", __func__)); } /* ng_btsocket_l2cap_untimeout */ /* * Process timeout on socket */ static void ng_btsocket_l2cap_process_timeout(void *xpcb) { ng_btsocket_l2cap_pcb_p pcb = (ng_btsocket_l2cap_pcb_p) xpcb; mtx_assert(&pcb->pcb_mtx, MA_OWNED); pcb->flags &= ~NG_BTSOCKET_L2CAP_TIMO; pcb->so->so_error = ETIMEDOUT; switch (pcb->state) { case NG_BTSOCKET_L2CAP_CONNECTING: case NG_BTSOCKET_L2CAP_CONFIGURING: case NG_BTSOCKET_L2CAP_W4_ENC_CHANGE: /* Send disconnect request with "zero" token */ if (pcb->cid != 0) ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); /* ... and close the socket */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); break; case NG_BTSOCKET_L2CAP_OPEN: /* Send timeout - drop packet and wakeup sender */ sbdroprecord(&pcb->so->so_snd); sowwakeup(pcb->so); break; case NG_BTSOCKET_L2CAP_DISCONNECTING: /* Disconnect timeout - disconnect the socket anyway */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); break; default: NG_BTSOCKET_L2CAP_ERR( "%s: Invalid socket state=%d\n", __func__, pcb->state); break; } } /* ng_btsocket_l2cap_process_timeout */ /* * Translate HCI/L2CAP error code into "errno" code * XXX Note: Some L2CAP and HCI error codes have the same value, but * different meaning */ static int ng_btsocket_l2cap_result2errno(int result) { switch (result) { case 0x00: /* No error */ return (0); case 0x01: /* Unknown HCI command */ return (ENODEV); case 0x02: /* No connection */ return (ENOTCONN); case 0x03: /* Hardware failure */ return (EIO); case 0x04: /* Page timeout */ return (EHOSTDOWN); case 0x05: /* Authentication failure */ case 0x06: /* Key missing */ case 0x18: /* Pairing not allowed */ case 0x21: /* Role change not allowed */ case 0x24: /* LMP PSU not allowed */ case 0x25: /* Encryption mode not acceptable */ case 0x26: /* Unit key used */ return (EACCES); case 0x07: /* Memory full */ return (ENOMEM); case 0x08: /* Connection timeout */ case 0x10: /* Host timeout */ case 0x22: /* LMP response timeout */ case 0xee: /* HCI timeout */ case 0xeeee: /* L2CAP timeout */ return (ETIMEDOUT); case 0x09: /* Max number of connections */ case 0x0a: /* Max number of SCO connections to a unit */ return (EMLINK); case 0x0b: /* ACL connection already exists */ return (EEXIST); case 0x0c: /* Command disallowed */ return (EBUSY); case 0x0d: /* Host rejected due to limited resources */ case 0x0e: /* Host rejected due to securiity reasons */ case 0x0f: /* Host rejected due to remote unit is a personal unit */ case 0x1b: /* SCO offset rejected */ case 0x1c: /* SCO interval rejected */ case 0x1d: /* SCO air mode rejected */ return (ECONNREFUSED); case 0x11: /* Unsupported feature or parameter value */ case 0x19: /* Unknown LMP PDU */ case 0x1a: /* Unsupported remote feature */ case 0x20: /* Unsupported LMP parameter value */ case 0x27: /* QoS is not supported */ case 0x29: /* Paring with unit key not supported */ return (EOPNOTSUPP); case 0x12: /* Invalid HCI command parameter */ case 0x1e: /* Invalid LMP parameters */ return (EINVAL); case 0x13: /* Other end terminated connection: User ended connection */ case 0x14: /* Other end terminated connection: Low resources */ case 0x15: /* Other end terminated connection: About to power off */ return (ECONNRESET); case 0x16: /* Connection terminated by local host */ return (ECONNABORTED); #if 0 /* XXX not yet */ case 0x17: /* Repeated attempts */ case 0x1f: /* Unspecified error */ case 0x23: /* LMP error transaction collision */ case 0x28: /* Instant passed */ #endif } return (ENOSYS); } /* ng_btsocket_l2cap_result2errno */ Index: head/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c =================================================================== --- head/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c (revision 319721) +++ head/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c (revision 319722) @@ -1,3582 +1,3558 @@ /* * ng_btsocket_rfcomm.c */ /*- * Copyright (c) 2001-2003 Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: ng_btsocket_rfcomm.c,v 1.28 2003/09/14 23:29:06 max Exp $ * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* MALLOC define */ #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_BTSOCKET_RFCOMM, "netgraph_btsocks_rfcomm", "Netgraph Bluetooth RFCOMM sockets"); #else #define M_NETGRAPH_BTSOCKET_RFCOMM M_NETGRAPH #endif /* NG_SEPARATE_MALLOC */ /* Debug */ #define NG_BTSOCKET_RFCOMM_INFO \ if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_INFO_LEVEL && \ ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \ printf #define NG_BTSOCKET_RFCOMM_WARN \ if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_WARN_LEVEL && \ ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \ printf #define NG_BTSOCKET_RFCOMM_ERR \ if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_ERR_LEVEL && \ ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \ printf #define NG_BTSOCKET_RFCOMM_ALERT \ if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_ALERT_LEVEL && \ ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \ printf #define ALOT 0x7fff /* Local prototypes */ static int ng_btsocket_rfcomm_upcall (struct socket *so, void *arg, int waitflag); static void ng_btsocket_rfcomm_sessions_task (void *ctx, int pending); static void ng_btsocket_rfcomm_session_task (ng_btsocket_rfcomm_session_p s); #define ng_btsocket_rfcomm_task_wakeup() \ taskqueue_enqueue(taskqueue_swi_giant, &ng_btsocket_rfcomm_task) static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_connect_ind (ng_btsocket_rfcomm_session_p s, int channel); static void ng_btsocket_rfcomm_connect_cfm (ng_btsocket_rfcomm_session_p s); static int ng_btsocket_rfcomm_session_create (ng_btsocket_rfcomm_session_p *sp, struct socket *l2so, bdaddr_p src, bdaddr_p dst, struct thread *td); static int ng_btsocket_rfcomm_session_accept (ng_btsocket_rfcomm_session_p s0); static int ng_btsocket_rfcomm_session_connect (ng_btsocket_rfcomm_session_p s); static int ng_btsocket_rfcomm_session_receive (ng_btsocket_rfcomm_session_p s); static int ng_btsocket_rfcomm_session_send (ng_btsocket_rfcomm_session_p s); static void ng_btsocket_rfcomm_session_clean (ng_btsocket_rfcomm_session_p s); static void ng_btsocket_rfcomm_session_process_pcb (ng_btsocket_rfcomm_session_p s); static ng_btsocket_rfcomm_session_p ng_btsocket_rfcomm_session_by_addr (bdaddr_p src, bdaddr_p dst); static int ng_btsocket_rfcomm_receive_frame (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_sabm (ng_btsocket_rfcomm_session_p s, int dlci); static int ng_btsocket_rfcomm_receive_disc (ng_btsocket_rfcomm_session_p s, int dlci); static int ng_btsocket_rfcomm_receive_ua (ng_btsocket_rfcomm_session_p s, int dlci); static int ng_btsocket_rfcomm_receive_dm (ng_btsocket_rfcomm_session_p s, int dlci); static int ng_btsocket_rfcomm_receive_uih (ng_btsocket_rfcomm_session_p s, int dlci, int pf, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_mcc (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_test (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_fc (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_msc (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_rpn (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_rls (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_pn (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static void ng_btsocket_rfcomm_set_pn (ng_btsocket_rfcomm_pcb_p pcb, u_int8_t cr, u_int8_t flow_control, u_int8_t credits, u_int16_t mtu); static int ng_btsocket_rfcomm_send_command (ng_btsocket_rfcomm_session_p s, u_int8_t type, u_int8_t dlci); static int ng_btsocket_rfcomm_send_uih (ng_btsocket_rfcomm_session_p s, u_int8_t address, u_int8_t pf, u_int8_t credits, struct mbuf *data); static int ng_btsocket_rfcomm_send_msc (ng_btsocket_rfcomm_pcb_p pcb); static int ng_btsocket_rfcomm_send_pn (ng_btsocket_rfcomm_pcb_p pcb); static int ng_btsocket_rfcomm_send_credits (ng_btsocket_rfcomm_pcb_p pcb); static int ng_btsocket_rfcomm_pcb_send (ng_btsocket_rfcomm_pcb_p pcb, int limit); static void ng_btsocket_rfcomm_pcb_kill (ng_btsocket_rfcomm_pcb_p pcb, int error); static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_pcb_by_dlci (ng_btsocket_rfcomm_session_p s, int dlci); static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_pcb_listener (bdaddr_p src, int channel); static void ng_btsocket_rfcomm_timeout (ng_btsocket_rfcomm_pcb_p pcb); static void ng_btsocket_rfcomm_untimeout (ng_btsocket_rfcomm_pcb_p pcb); static void ng_btsocket_rfcomm_process_timeout (void *xpcb); static struct mbuf * ng_btsocket_rfcomm_prepare_packet (struct sockbuf *sb, int length); /* Globals */ extern int ifqmaxlen; static u_int32_t ng_btsocket_rfcomm_debug_level; static u_int32_t ng_btsocket_rfcomm_timo; struct task ng_btsocket_rfcomm_task; static LIST_HEAD(, ng_btsocket_rfcomm_session) ng_btsocket_rfcomm_sessions; static struct mtx ng_btsocket_rfcomm_sessions_mtx; static LIST_HEAD(, ng_btsocket_rfcomm_pcb) ng_btsocket_rfcomm_sockets; static struct mtx ng_btsocket_rfcomm_sockets_mtx; static struct timeval ng_btsocket_rfcomm_lasttime; static int ng_btsocket_rfcomm_curpps; /* Sysctl tree */ SYSCTL_DECL(_net_bluetooth_rfcomm_sockets); static SYSCTL_NODE(_net_bluetooth_rfcomm_sockets, OID_AUTO, stream, CTLFLAG_RW, 0, "Bluetooth STREAM RFCOMM sockets family"); SYSCTL_UINT(_net_bluetooth_rfcomm_sockets_stream, OID_AUTO, debug_level, CTLFLAG_RW, &ng_btsocket_rfcomm_debug_level, NG_BTSOCKET_INFO_LEVEL, "Bluetooth STREAM RFCOMM sockets debug level"); SYSCTL_UINT(_net_bluetooth_rfcomm_sockets_stream, OID_AUTO, timeout, CTLFLAG_RW, &ng_btsocket_rfcomm_timo, 60, "Bluetooth STREAM RFCOMM sockets timeout"); /***************************************************************************** ***************************************************************************** ** RFCOMM CRC ***************************************************************************** *****************************************************************************/ static u_int8_t ng_btsocket_rfcomm_crc_table[256] = { 0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75, 0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b, 0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69, 0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67, 0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d, 0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43, 0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51, 0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f, 0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05, 0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b, 0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19, 0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17, 0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d, 0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33, 0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21, 0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f, 0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95, 0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b, 0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89, 0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87, 0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad, 0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3, 0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1, 0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf, 0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5, 0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb, 0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9, 0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7, 0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd, 0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3, 0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1, 0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf }; /* CRC */ static u_int8_t ng_btsocket_rfcomm_crc(u_int8_t *data, int length) { u_int8_t crc = 0xff; while (length --) crc = ng_btsocket_rfcomm_crc_table[crc ^ *data++]; return (crc); } /* ng_btsocket_rfcomm_crc */ /* FCS on 2 bytes */ static u_int8_t ng_btsocket_rfcomm_fcs2(u_int8_t *data) { return (0xff - ng_btsocket_rfcomm_crc(data, 2)); } /* ng_btsocket_rfcomm_fcs2 */ /* FCS on 3 bytes */ static u_int8_t ng_btsocket_rfcomm_fcs3(u_int8_t *data) { return (0xff - ng_btsocket_rfcomm_crc(data, 3)); } /* ng_btsocket_rfcomm_fcs3 */ /* * Check FCS * * From Bluetooth spec * * "... In 07.10, the frame check sequence (FCS) is calculated on different * sets of fields for different frame types. These are the fields that the * FCS are calculated on: * * For SABM, DISC, UA, DM frames: on Address, Control and length field. * For UIH frames: on Address and Control field. * * (This is stated here for clarification, and to set the standard for RFCOMM; * the fields included in FCS calculation have actually changed in version * 7.0.0 of TS 07.10, but RFCOMM will not change the FCS calculation scheme * from the one above.) ..." */ static int ng_btsocket_rfcomm_check_fcs(u_int8_t *data, int type, u_int8_t fcs) { if (type != RFCOMM_FRAME_UIH) return (ng_btsocket_rfcomm_fcs3(data) != fcs); return (ng_btsocket_rfcomm_fcs2(data) != fcs); } /* ng_btsocket_rfcomm_check_fcs */ /***************************************************************************** ***************************************************************************** ** Socket interface ***************************************************************************** *****************************************************************************/ /* * Initialize everything */ void ng_btsocket_rfcomm_init(void) { /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; ng_btsocket_rfcomm_debug_level = NG_BTSOCKET_WARN_LEVEL; ng_btsocket_rfcomm_timo = 60; /* RFCOMM task */ TASK_INIT(&ng_btsocket_rfcomm_task, 0, ng_btsocket_rfcomm_sessions_task, NULL); /* RFCOMM sessions list */ LIST_INIT(&ng_btsocket_rfcomm_sessions); mtx_init(&ng_btsocket_rfcomm_sessions_mtx, "btsocks_rfcomm_sessions_mtx", NULL, MTX_DEF); /* RFCOMM sockets list */ LIST_INIT(&ng_btsocket_rfcomm_sockets); mtx_init(&ng_btsocket_rfcomm_sockets_mtx, "btsocks_rfcomm_sockets_mtx", NULL, MTX_DEF); } /* ng_btsocket_rfcomm_init */ /* * Abort connection on socket */ void ng_btsocket_rfcomm_abort(struct socket *so) { so->so_error = ECONNABORTED; (void)ng_btsocket_rfcomm_disconnect(so); } /* ng_btsocket_rfcomm_abort */ void ng_btsocket_rfcomm_close(struct socket *so) { (void)ng_btsocket_rfcomm_disconnect(so); } /* ng_btsocket_rfcomm_close */ /* * Accept connection on socket. Nothing to do here, socket must be connected * and ready, so just return peer address and be done with it. */ int ng_btsocket_rfcomm_accept(struct socket *so, struct sockaddr **nam) { return (ng_btsocket_rfcomm_peeraddr(so, nam)); } /* ng_btsocket_rfcomm_accept */ /* * Create and attach new socket */ int ng_btsocket_rfcomm_attach(struct socket *so, int proto, struct thread *td) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); int error; /* Check socket and protocol */ if (so->so_type != SOCK_STREAM) return (ESOCKTNOSUPPORT); #if 0 /* XXX sonewconn() calls "pru_attach" with proto == 0 */ if (proto != 0) if (proto != BLUETOOTH_PROTO_RFCOMM) return (EPROTONOSUPPORT); #endif /* XXX */ if (pcb != NULL) return (EISCONN); /* Reserve send and receive space if it is not reserved yet */ if ((so->so_snd.sb_hiwat == 0) || (so->so_rcv.sb_hiwat == 0)) { error = soreserve(so, NG_BTSOCKET_RFCOMM_SENDSPACE, NG_BTSOCKET_RFCOMM_RECVSPACE); if (error != 0) return (error); } /* Allocate the PCB */ pcb = malloc(sizeof(*pcb), M_NETGRAPH_BTSOCKET_RFCOMM, M_NOWAIT | M_ZERO); if (pcb == NULL) return (ENOMEM); /* Link the PCB and the socket */ so->so_pcb = (caddr_t) pcb; pcb->so = so; /* Initialize PCB */ pcb->state = NG_BTSOCKET_RFCOMM_DLC_CLOSED; pcb->flags = NG_BTSOCKET_RFCOMM_DLC_CFC; pcb->lmodem = pcb->rmodem = (RFCOMM_MODEM_RTC | RFCOMM_MODEM_RTR | RFCOMM_MODEM_DV); pcb->mtu = RFCOMM_DEFAULT_MTU; pcb->tx_cred = 0; pcb->rx_cred = RFCOMM_DEFAULT_CREDITS; mtx_init(&pcb->pcb_mtx, "btsocks_rfcomm_pcb_mtx", NULL, MTX_DEF); callout_init_mtx(&pcb->timo, &pcb->pcb_mtx, 0); /* Add the PCB to the list */ mtx_lock(&ng_btsocket_rfcomm_sockets_mtx); LIST_INSERT_HEAD(&ng_btsocket_rfcomm_sockets, pcb, next); mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); return (0); } /* ng_btsocket_rfcomm_attach */ /* * Bind socket */ int ng_btsocket_rfcomm_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_rfcomm_pcb_t *pcb = so2rfcomm_pcb(so), *pcb1; struct sockaddr_rfcomm *sa = (struct sockaddr_rfcomm *) nam; if (pcb == NULL) return (EINVAL); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->rfcomm_family != AF_BLUETOOTH) return (EAFNOSUPPORT); if (sa->rfcomm_len != sizeof(*sa)) return (EINVAL); if (sa->rfcomm_channel > 30) return (EINVAL); mtx_lock(&pcb->pcb_mtx); if (sa->rfcomm_channel != 0) { mtx_lock(&ng_btsocket_rfcomm_sockets_mtx); LIST_FOREACH(pcb1, &ng_btsocket_rfcomm_sockets, next) { if (pcb1->channel == sa->rfcomm_channel && bcmp(&pcb1->src, &sa->rfcomm_bdaddr, sizeof(pcb1->src)) == 0) { mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); mtx_unlock(&pcb->pcb_mtx); return (EADDRINUSE); } } mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); } bcopy(&sa->rfcomm_bdaddr, &pcb->src, sizeof(pcb->src)); pcb->channel = sa->rfcomm_channel; mtx_unlock(&pcb->pcb_mtx); return (0); } /* ng_btsocket_rfcomm_bind */ /* * Connect socket */ int ng_btsocket_rfcomm_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_rfcomm_pcb_t *pcb = so2rfcomm_pcb(so); struct sockaddr_rfcomm *sa = (struct sockaddr_rfcomm *) nam; ng_btsocket_rfcomm_session_t *s = NULL; struct socket *l2so = NULL; int dlci, error = 0; if (pcb == NULL) return (EINVAL); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->rfcomm_family != AF_BLUETOOTH) return (EAFNOSUPPORT); if (sa->rfcomm_len != sizeof(*sa)) return (EINVAL); if (sa->rfcomm_channel > 30) return (EINVAL); if (sa->rfcomm_channel == 0 || bcmp(&sa->rfcomm_bdaddr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) return (EDESTADDRREQ); /* * Note that we will not check for errors in socreate() because * if we failed to create L2CAP socket at this point we still * might have already open session. */ error = socreate(PF_BLUETOOTH, &l2so, SOCK_SEQPACKET, BLUETOOTH_PROTO_L2CAP, td->td_ucred, td); /* * Look for session between "pcb->src" and "sa->rfcomm_bdaddr" (dst) */ mtx_lock(&ng_btsocket_rfcomm_sessions_mtx); s = ng_btsocket_rfcomm_session_by_addr(&pcb->src, &sa->rfcomm_bdaddr); if (s == NULL) { /* * We need to create new RFCOMM session. Check if we have L2CAP * socket. If l2so == NULL then error has the error code from * socreate() */ if (l2so == NULL) { mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); return (error); } error = ng_btsocket_rfcomm_session_create(&s, l2so, &pcb->src, &sa->rfcomm_bdaddr, td); if (error != 0) { mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); soclose(l2so); return (error); } } else if (l2so != NULL) soclose(l2so); /* we don't need new L2CAP socket */ /* * Check if we already have the same DLCI the same session */ mtx_lock(&s->session_mtx); mtx_lock(&pcb->pcb_mtx); dlci = RFCOMM_MKDLCI(!INITIATOR(s), sa->rfcomm_channel); if (ng_btsocket_rfcomm_pcb_by_dlci(s, dlci) != NULL) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&s->session_mtx); mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); return (EBUSY); } /* * Check session state and if its not acceptable then refuse connection */ switch (s->state) { case NG_BTSOCKET_RFCOMM_SESSION_CONNECTING: case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED: case NG_BTSOCKET_RFCOMM_SESSION_OPEN: /* * Update destination address and channel and attach * DLC to the session */ bcopy(&sa->rfcomm_bdaddr, &pcb->dst, sizeof(pcb->dst)); pcb->channel = sa->rfcomm_channel; pcb->dlci = dlci; LIST_INSERT_HEAD(&s->dlcs, pcb, session_next); pcb->session = s; ng_btsocket_rfcomm_timeout(pcb); soisconnecting(pcb->so); if (s->state == NG_BTSOCKET_RFCOMM_SESSION_OPEN) { pcb->mtu = s->mtu; bcopy(&so2l2cap_pcb(s->l2so)->src, &pcb->src, sizeof(pcb->src)); pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONFIGURING; error = ng_btsocket_rfcomm_send_pn(pcb); if (error == 0) error = ng_btsocket_rfcomm_task_wakeup(); } else pcb->state = NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT; break; default: error = ECONNRESET; break; } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&s->session_mtx); mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); return (error); } /* ng_btsocket_rfcomm_connect */ /* * Process ioctl's calls on socket. * XXX FIXME this should provide interface to the RFCOMM multiplexor channel */ int ng_btsocket_rfcomm_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return (EINVAL); } /* ng_btsocket_rfcomm_control */ /* * Process getsockopt/setsockopt system calls */ int ng_btsocket_rfcomm_ctloutput(struct socket *so, struct sockopt *sopt) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); struct ng_btsocket_rfcomm_fc_info fcinfo; int error = 0; if (pcb == NULL) return (EINVAL); if (sopt->sopt_level != SOL_RFCOMM) return (0); mtx_lock(&pcb->pcb_mtx); switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case SO_RFCOMM_MTU: error = sooptcopyout(sopt, &pcb->mtu, sizeof(pcb->mtu)); break; case SO_RFCOMM_FC_INFO: fcinfo.lmodem = pcb->lmodem; fcinfo.rmodem = pcb->rmodem; fcinfo.tx_cred = pcb->tx_cred; fcinfo.rx_cred = pcb->rx_cred; fcinfo.cfc = (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC)? 1 : 0; fcinfo.reserved = 0; error = sooptcopyout(sopt, &fcinfo, sizeof(fcinfo)); break; default: error = ENOPROTOOPT; break; } break; case SOPT_SET: switch (sopt->sopt_name) { default: error = ENOPROTOOPT; break; } break; default: error = EINVAL; break; } mtx_unlock(&pcb->pcb_mtx); return (error); } /* ng_btsocket_rfcomm_ctloutput */ /* * Detach and destroy socket */ void ng_btsocket_rfcomm_detach(struct socket *so) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); KASSERT(pcb != NULL, ("ng_btsocket_rfcomm_detach: pcb == NULL")); mtx_lock(&pcb->pcb_mtx); switch (pcb->state) { case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT: case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING: case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: case NG_BTSOCKET_RFCOMM_DLC_CONNECTED: /* XXX What to do with pending request? */ if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) ng_btsocket_rfcomm_untimeout(pcb); if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT) pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_DETACHED; else pcb->state = NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING; ng_btsocket_rfcomm_task_wakeup(); break; case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING: ng_btsocket_rfcomm_task_wakeup(); break; } while (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CLOSED) msleep(&pcb->state, &pcb->pcb_mtx, PZERO, "rf_det", 0); if (pcb->session != NULL) panic("%s: pcb->session != NULL\n", __func__); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) panic("%s: timeout on closed DLC, flags=%#x\n", __func__, pcb->flags); mtx_lock(&ng_btsocket_rfcomm_sockets_mtx); LIST_REMOVE(pcb, next); mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); mtx_unlock(&pcb->pcb_mtx); mtx_destroy(&pcb->pcb_mtx); bzero(pcb, sizeof(*pcb)); free(pcb, M_NETGRAPH_BTSOCKET_RFCOMM); soisdisconnected(so); so->so_pcb = NULL; } /* ng_btsocket_rfcomm_detach */ /* * Disconnect socket */ int ng_btsocket_rfcomm_disconnect(struct socket *so) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); if (pcb == NULL) return (EINVAL); mtx_lock(&pcb->pcb_mtx); if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING) { mtx_unlock(&pcb->pcb_mtx); return (EINPROGRESS); } /* XXX What to do with pending request? */ if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) ng_btsocket_rfcomm_untimeout(pcb); switch (pcb->state) { case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING: /* XXX can we get here? */ case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: /* XXX can we get here? */ case NG_BTSOCKET_RFCOMM_DLC_CONNECTED: /* * Just change DLC state and enqueue RFCOMM task. It will * queue and send DISC on the DLC. */ pcb->state = NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING; soisdisconnecting(so); ng_btsocket_rfcomm_task_wakeup(); break; case NG_BTSOCKET_RFCOMM_DLC_CLOSED: case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT: break; default: panic("%s: Invalid DLC state=%d, flags=%#x\n", __func__, pcb->state, pcb->flags); break; } mtx_unlock(&pcb->pcb_mtx); return (0); } /* ng_btsocket_rfcomm_disconnect */ /* * Listen on socket. First call to listen() will create listening RFCOMM session */ int ng_btsocket_rfcomm_listen(struct socket *so, int backlog, struct thread *td) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so), pcb1; ng_btsocket_rfcomm_session_p s = NULL; struct socket *l2so = NULL; int error, socreate_error, usedchannels; if (pcb == NULL) return (EINVAL); if (pcb->channel > 30) return (EADDRNOTAVAIL); usedchannels = 0; mtx_lock(&pcb->pcb_mtx); if (pcb->channel == 0) { mtx_lock(&ng_btsocket_rfcomm_sockets_mtx); LIST_FOREACH(pcb1, &ng_btsocket_rfcomm_sockets, next) if (pcb1->channel != 0 && bcmp(&pcb1->src, &pcb->src, sizeof(pcb->src)) == 0) usedchannels |= (1 << (pcb1->channel - 1)); for (pcb->channel = 30; pcb->channel > 0; pcb->channel --) if (!(usedchannels & (1 << (pcb->channel - 1)))) break; if (pcb->channel == 0) { mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); mtx_unlock(&pcb->pcb_mtx); return (EADDRNOTAVAIL); } mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); } mtx_unlock(&pcb->pcb_mtx); /* * Note that we will not check for errors in socreate() because * if we failed to create L2CAP socket at this point we still * might have already open session. */ socreate_error = socreate(PF_BLUETOOTH, &l2so, SOCK_SEQPACKET, BLUETOOTH_PROTO_L2CAP, td->td_ucred, td); /* * Transition the socket and session into the LISTENING state. Check * for collisions first, as there can only be one. */ mtx_lock(&ng_btsocket_rfcomm_sessions_mtx); SOCK_LOCK(so); error = solisten_proto_check(so); SOCK_UNLOCK(so); if (error != 0) goto out; LIST_FOREACH(s, &ng_btsocket_rfcomm_sessions, next) if (s->state == NG_BTSOCKET_RFCOMM_SESSION_LISTENING) break; if (s == NULL) { /* * We need to create default RFCOMM session. Check if we have * L2CAP socket. If l2so == NULL then error has the error code * from socreate() */ if (l2so == NULL) { error = socreate_error; goto out; } /* * Create default listen RFCOMM session. The default RFCOMM * session will listen on ANY address. * * XXX FIXME Note that currently there is no way to adjust MTU * for the default session. */ error = ng_btsocket_rfcomm_session_create(&s, l2so, NG_HCI_BDADDR_ANY, NULL, td); if (error != 0) goto out; l2so = NULL; } SOCK_LOCK(so); solisten_proto(so, backlog); SOCK_UNLOCK(so); out: mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); /* * If we still have an l2so reference here, it's unneeded, so release * it. */ if (l2so != NULL) soclose(l2so); return (error); } /* ng_btsocket_listen */ /* * Get peer address */ int ng_btsocket_rfcomm_peeraddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); struct sockaddr_rfcomm sa; if (pcb == NULL) return (EINVAL); bcopy(&pcb->dst, &sa.rfcomm_bdaddr, sizeof(sa.rfcomm_bdaddr)); sa.rfcomm_channel = pcb->channel; sa.rfcomm_len = sizeof(sa); sa.rfcomm_family = AF_BLUETOOTH; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } /* ng_btsocket_rfcomm_peeraddr */ /* * Send data to socket */ int ng_btsocket_rfcomm_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { ng_btsocket_rfcomm_pcb_t *pcb = so2rfcomm_pcb(so); int error = 0; /* Check socket and input */ if (pcb == NULL || m == NULL || control != NULL) { error = EINVAL; goto drop; } mtx_lock(&pcb->pcb_mtx); /* Make sure DLC is connected */ if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTED) { mtx_unlock(&pcb->pcb_mtx); error = ENOTCONN; goto drop; } /* Put the packet on the socket's send queue and wakeup RFCOMM task */ sbappend(&pcb->so->so_snd, m, flags); m = NULL; if (!(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_SENDING)) { pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_SENDING; error = ng_btsocket_rfcomm_task_wakeup(); } mtx_unlock(&pcb->pcb_mtx); drop: NG_FREE_M(m); /* checks for != NULL */ NG_FREE_M(control); return (error); } /* ng_btsocket_rfcomm_send */ /* * Get socket address */ int ng_btsocket_rfcomm_sockaddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); struct sockaddr_rfcomm sa; if (pcb == NULL) return (EINVAL); bcopy(&pcb->src, &sa.rfcomm_bdaddr, sizeof(sa.rfcomm_bdaddr)); sa.rfcomm_channel = pcb->channel; sa.rfcomm_len = sizeof(sa); sa.rfcomm_family = AF_BLUETOOTH; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } /* ng_btsocket_rfcomm_sockaddr */ /* * Upcall function for L2CAP sockets. Enqueue RFCOMM task. */ static int ng_btsocket_rfcomm_upcall(struct socket *so, void *arg, int waitflag) { int error; if (so == NULL) panic("%s: so == NULL\n", __func__); if ((error = ng_btsocket_rfcomm_task_wakeup()) != 0) NG_BTSOCKET_RFCOMM_ALERT( "%s: Could not enqueue RFCOMM task, error=%d\n", __func__, error); return (SU_OK); } /* ng_btsocket_rfcomm_upcall */ /* * RFCOMM task. Will handle all RFCOMM sessions in one pass. * XXX FIXME does not scale very well */ static void ng_btsocket_rfcomm_sessions_task(void *ctx, int pending) { ng_btsocket_rfcomm_session_p s = NULL, s_next = NULL; mtx_lock(&ng_btsocket_rfcomm_sessions_mtx); for (s = LIST_FIRST(&ng_btsocket_rfcomm_sessions); s != NULL; ) { mtx_lock(&s->session_mtx); s_next = LIST_NEXT(s, next); ng_btsocket_rfcomm_session_task(s); if (s->state == NG_BTSOCKET_RFCOMM_SESSION_CLOSED) { /* Unlink and clean the session */ LIST_REMOVE(s, next); NG_BT_MBUFQ_DRAIN(&s->outq); if (!LIST_EMPTY(&s->dlcs)) panic("%s: DLC list is not empty\n", __func__); /* Close L2CAP socket */ SOCKBUF_LOCK(&s->l2so->so_rcv); soupcall_clear(s->l2so, SO_RCV); SOCKBUF_UNLOCK(&s->l2so->so_rcv); SOCKBUF_LOCK(&s->l2so->so_snd); soupcall_clear(s->l2so, SO_SND); SOCKBUF_UNLOCK(&s->l2so->so_snd); soclose(s->l2so); mtx_unlock(&s->session_mtx); mtx_destroy(&s->session_mtx); bzero(s, sizeof(*s)); free(s, M_NETGRAPH_BTSOCKET_RFCOMM); } else mtx_unlock(&s->session_mtx); s = s_next; } mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); } /* ng_btsocket_rfcomm_sessions_task */ /* * Process RFCOMM session. Will handle all RFCOMM sockets in one pass. */ static void ng_btsocket_rfcomm_session_task(ng_btsocket_rfcomm_session_p s) { mtx_assert(&s->session_mtx, MA_OWNED); if (s->l2so->so_rcv.sb_state & SBS_CANTRCVMORE) { NG_BTSOCKET_RFCOMM_INFO( "%s: L2CAP connection has been terminated, so=%p, so_state=%#x, so_count=%d, " \ "state=%d, flags=%#x\n", __func__, s->l2so, s->l2so->so_state, s->l2so->so_count, s->state, s->flags); s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; ng_btsocket_rfcomm_session_clean(s); } /* Now process upcall */ switch (s->state) { /* Try to accept new L2CAP connection(s) */ case NG_BTSOCKET_RFCOMM_SESSION_LISTENING: while (ng_btsocket_rfcomm_session_accept(s) == 0) ; break; /* Process the results of the L2CAP connect */ case NG_BTSOCKET_RFCOMM_SESSION_CONNECTING: ng_btsocket_rfcomm_session_process_pcb(s); if (ng_btsocket_rfcomm_session_connect(s) != 0) { s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; ng_btsocket_rfcomm_session_clean(s); } break; /* Try to receive/send more data */ case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED: case NG_BTSOCKET_RFCOMM_SESSION_OPEN: case NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING: ng_btsocket_rfcomm_session_process_pcb(s); if (ng_btsocket_rfcomm_session_receive(s) != 0) { s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; ng_btsocket_rfcomm_session_clean(s); } else if (ng_btsocket_rfcomm_session_send(s) != 0) { s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; ng_btsocket_rfcomm_session_clean(s); } break; case NG_BTSOCKET_RFCOMM_SESSION_CLOSED: break; default: panic("%s: Invalid session state=%d, flags=%#x\n", __func__, s->state, s->flags); break; } } /* ng_btsocket_rfcomm_session_task */ /* * Process RFCOMM connection indicator. Caller must hold s->session_mtx */ static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_connect_ind(ng_btsocket_rfcomm_session_p s, int channel) { ng_btsocket_rfcomm_pcb_p pcb = NULL, pcb1 = NULL; ng_btsocket_l2cap_pcb_p l2pcb = NULL; - struct socket *so1 = NULL; + struct socket *so1; mtx_assert(&s->session_mtx, MA_OWNED); /* * Try to find RFCOMM socket that listens on given source address * and channel. This will return the best possible match. */ l2pcb = so2l2cap_pcb(s->l2so); pcb = ng_btsocket_rfcomm_pcb_listener(&l2pcb->src, channel); if (pcb == NULL) return (NULL); /* * Check the pending connections queue and if we have space then * create new socket and set proper source and destination address, * and channel. */ mtx_lock(&pcb->pcb_mtx); - if (pcb->so->so_qlen <= pcb->so->so_qlimit) { - CURVNET_SET(pcb->so->so_vnet); - so1 = sonewconn(pcb->so, 0); - CURVNET_RESTORE(); - } + CURVNET_SET(pcb->so->so_vnet); + so1 = sonewconn(pcb->so, 0); + CURVNET_RESTORE(); mtx_unlock(&pcb->pcb_mtx); if (so1 == NULL) return (NULL); /* * If we got here than we have created new socket. So complete the * connection. Set source and destination address from the session. */ pcb1 = so2rfcomm_pcb(so1); if (pcb1 == NULL) panic("%s: pcb1 == NULL\n", __func__); mtx_lock(&pcb1->pcb_mtx); bcopy(&l2pcb->src, &pcb1->src, sizeof(pcb1->src)); bcopy(&l2pcb->dst, &pcb1->dst, sizeof(pcb1->dst)); pcb1->channel = channel; /* Link new DLC to the session. We already hold s->session_mtx */ LIST_INSERT_HEAD(&s->dlcs, pcb1, session_next); pcb1->session = s; mtx_unlock(&pcb1->pcb_mtx); return (pcb1); } /* ng_btsocket_rfcomm_connect_ind */ /* * Process RFCOMM connect confirmation. Caller must hold s->session_mtx. */ static void ng_btsocket_rfcomm_connect_cfm(ng_btsocket_rfcomm_session_p s) { ng_btsocket_rfcomm_pcb_p pcb = NULL, pcb_next = NULL; int error; mtx_assert(&s->session_mtx, MA_OWNED); /* * Wake up all waiting sockets and send PN request for each of them. * Note that timeout already been set in ng_btsocket_rfcomm_connect() * * Note: cannot use LIST_FOREACH because ng_btsocket_rfcomm_pcb_kill * will unlink DLC from the session */ for (pcb = LIST_FIRST(&s->dlcs); pcb != NULL; ) { mtx_lock(&pcb->pcb_mtx); pcb_next = LIST_NEXT(pcb, session_next); if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT) { pcb->mtu = s->mtu; bcopy(&so2l2cap_pcb(s->l2so)->src, &pcb->src, sizeof(pcb->src)); error = ng_btsocket_rfcomm_send_pn(pcb); if (error == 0) pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONFIGURING; else ng_btsocket_rfcomm_pcb_kill(pcb, error); } mtx_unlock(&pcb->pcb_mtx); pcb = pcb_next; } } /* ng_btsocket_rfcomm_connect_cfm */ /***************************************************************************** ***************************************************************************** ** RFCOMM sessions ***************************************************************************** *****************************************************************************/ /* * Create new RFCOMM session. That function WILL NOT take ownership over l2so. * Caller MUST free l2so if function failed. */ static int ng_btsocket_rfcomm_session_create(ng_btsocket_rfcomm_session_p *sp, struct socket *l2so, bdaddr_p src, bdaddr_p dst, struct thread *td) { ng_btsocket_rfcomm_session_p s = NULL; struct sockaddr_l2cap l2sa; struct sockopt l2sopt; int error; u_int16_t mtu; mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED); /* Allocate the RFCOMM session */ s = malloc(sizeof(*s), M_NETGRAPH_BTSOCKET_RFCOMM, M_NOWAIT | M_ZERO); if (s == NULL) return (ENOMEM); /* Set defaults */ s->mtu = RFCOMM_DEFAULT_MTU; s->flags = 0; s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; NG_BT_MBUFQ_INIT(&s->outq, ifqmaxlen); /* * XXX Mark session mutex as DUPOK to prevent "duplicated lock of * the same type" message. When accepting new L2CAP connection * ng_btsocket_rfcomm_session_accept() holds both session mutexes * for "old" (accepting) session and "new" (created) session. */ mtx_init(&s->session_mtx, "btsocks_rfcomm_session_mtx", NULL, MTX_DEF|MTX_DUPOK); LIST_INIT(&s->dlcs); /* Prepare L2CAP socket */ SOCKBUF_LOCK(&l2so->so_rcv); soupcall_set(l2so, SO_RCV, ng_btsocket_rfcomm_upcall, NULL); SOCKBUF_UNLOCK(&l2so->so_rcv); SOCKBUF_LOCK(&l2so->so_snd); soupcall_set(l2so, SO_SND, ng_btsocket_rfcomm_upcall, NULL); SOCKBUF_UNLOCK(&l2so->so_snd); l2so->so_state |= SS_NBIO; s->l2so = l2so; mtx_lock(&s->session_mtx); /* * "src" == NULL and "dst" == NULL means just create session. * caller must do the rest */ if (src == NULL && dst == NULL) goto done; /* * Set incoming MTU on L2CAP socket. It is RFCOMM session default MTU * plus 5 bytes: RFCOMM frame header, one extra byte for length and one * extra byte for credits. */ mtu = s->mtu + sizeof(struct rfcomm_frame_hdr) + 1 + 1; l2sopt.sopt_dir = SOPT_SET; l2sopt.sopt_level = SOL_L2CAP; l2sopt.sopt_name = SO_L2CAP_IMTU; l2sopt.sopt_val = (void *) &mtu; l2sopt.sopt_valsize = sizeof(mtu); l2sopt.sopt_td = NULL; error = sosetopt(s->l2so, &l2sopt); if (error != 0) goto bad; /* Bind socket to "src" address */ l2sa.l2cap_len = sizeof(l2sa); l2sa.l2cap_family = AF_BLUETOOTH; l2sa.l2cap_psm = (dst == NULL)? htole16(NG_L2CAP_PSM_RFCOMM) : 0; bcopy(src, &l2sa.l2cap_bdaddr, sizeof(l2sa.l2cap_bdaddr)); l2sa.l2cap_cid = 0; l2sa.l2cap_bdaddr_type = BDADDR_BREDR; error = sobind(s->l2so, (struct sockaddr *) &l2sa, td); if (error != 0) goto bad; /* If "dst" is not NULL then initiate connect(), otherwise listen() */ if (dst == NULL) { s->flags = 0; s->state = NG_BTSOCKET_RFCOMM_SESSION_LISTENING; error = solisten(s->l2so, 10, td); if (error != 0) goto bad; } else { s->flags = NG_BTSOCKET_RFCOMM_SESSION_INITIATOR; s->state = NG_BTSOCKET_RFCOMM_SESSION_CONNECTING; l2sa.l2cap_len = sizeof(l2sa); l2sa.l2cap_family = AF_BLUETOOTH; l2sa.l2cap_psm = htole16(NG_L2CAP_PSM_RFCOMM); bcopy(dst, &l2sa.l2cap_bdaddr, sizeof(l2sa.l2cap_bdaddr)); l2sa.l2cap_cid = 0; l2sa.l2cap_bdaddr_type = BDADDR_BREDR; error = soconnect(s->l2so, (struct sockaddr *) &l2sa, td); if (error != 0) goto bad; } done: LIST_INSERT_HEAD(&ng_btsocket_rfcomm_sessions, s, next); *sp = s; mtx_unlock(&s->session_mtx); return (0); bad: mtx_unlock(&s->session_mtx); /* Return L2CAP socket back to its original state */ SOCKBUF_LOCK(&l2so->so_rcv); soupcall_clear(s->l2so, SO_RCV); SOCKBUF_UNLOCK(&l2so->so_rcv); SOCKBUF_LOCK(&l2so->so_snd); soupcall_clear(s->l2so, SO_SND); SOCKBUF_UNLOCK(&l2so->so_snd); l2so->so_state &= ~SS_NBIO; mtx_destroy(&s->session_mtx); bzero(s, sizeof(*s)); free(s, M_NETGRAPH_BTSOCKET_RFCOMM); return (error); } /* ng_btsocket_rfcomm_session_create */ /* * Process accept() on RFCOMM session * XXX FIXME locking for "l2so"? */ static int ng_btsocket_rfcomm_session_accept(ng_btsocket_rfcomm_session_p s0) { - struct socket *l2so = NULL; + struct socket *l2so; struct sockaddr_l2cap *l2sa = NULL; ng_btsocket_l2cap_pcb_t *l2pcb = NULL; ng_btsocket_rfcomm_session_p s = NULL; - int error = 0; + int error; mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED); mtx_assert(&s0->session_mtx, MA_OWNED); - /* Check if there is a complete L2CAP connection in the queue */ - if ((error = s0->l2so->so_error) != 0) { + SOLISTEN_LOCK(s0->l2so); + error = solisten_dequeue(s0->l2so, &l2so, 0); + if (error == EWOULDBLOCK) + return (error); + if (error) { NG_BTSOCKET_RFCOMM_ERR( "%s: Could not accept connection on L2CAP socket, error=%d\n", __func__, error); - s0->l2so->so_error = 0; - return (error); } - - ACCEPT_LOCK(); - if (TAILQ_EMPTY(&s0->l2so->so_comp)) { - ACCEPT_UNLOCK(); - if (s0->l2so->so_rcv.sb_state & SBS_CANTRCVMORE) - return (ECONNABORTED); - return (EWOULDBLOCK); - } - - /* Accept incoming L2CAP connection */ - l2so = TAILQ_FIRST(&s0->l2so->so_comp); - if (l2so == NULL) - panic("%s: l2so == NULL\n", __func__); - - TAILQ_REMOVE(&s0->l2so->so_comp, l2so, so_list); - s0->l2so->so_qlen --; - l2so->so_qstate &= ~SQ_COMP; - l2so->so_head = NULL; - SOCK_LOCK(l2so); - soref(l2so); - l2so->so_state |= SS_NBIO; - SOCK_UNLOCK(l2so); - ACCEPT_UNLOCK(); error = soaccept(l2so, (struct sockaddr **) &l2sa); if (error != 0) { NG_BTSOCKET_RFCOMM_ERR( "%s: soaccept() on L2CAP socket failed, error=%d\n", __func__, error); soclose(l2so); return (error); } /* * Check if there is already active RFCOMM session between two devices. * If so then close L2CAP connection. We only support one RFCOMM session * between each pair of devices. Note that here we assume session in any * state. The session even could be in the middle of disconnecting. */ l2pcb = so2l2cap_pcb(l2so); s = ng_btsocket_rfcomm_session_by_addr(&l2pcb->src, &l2pcb->dst); if (s == NULL) { /* Create a new RFCOMM session */ error = ng_btsocket_rfcomm_session_create(&s, l2so, NULL, NULL, curthread /* XXX */); if (error == 0) { mtx_lock(&s->session_mtx); s->flags = 0; s->state = NG_BTSOCKET_RFCOMM_SESSION_CONNECTED; /* * Adjust MTU on incoming connection. Reserve 5 bytes: * RFCOMM frame header, one extra byte for length and * one extra byte for credits. */ s->mtu = min(l2pcb->imtu, l2pcb->omtu) - sizeof(struct rfcomm_frame_hdr) - 1 - 1; mtx_unlock(&s->session_mtx); } else { NG_BTSOCKET_RFCOMM_ALERT( "%s: Failed to create new RFCOMM session, error=%d\n", __func__, error); soclose(l2so); } } else { NG_BTSOCKET_RFCOMM_WARN( "%s: Rejecting duplicating RFCOMM session between src=%x:%x:%x:%x:%x:%x and " \ "dst=%x:%x:%x:%x:%x:%x, state=%d, flags=%#x\n", __func__, l2pcb->src.b[5], l2pcb->src.b[4], l2pcb->src.b[3], l2pcb->src.b[2], l2pcb->src.b[1], l2pcb->src.b[0], l2pcb->dst.b[5], l2pcb->dst.b[4], l2pcb->dst.b[3], l2pcb->dst.b[2], l2pcb->dst.b[1], l2pcb->dst.b[0], s->state, s->flags); error = EBUSY; soclose(l2so); } return (error); } /* ng_btsocket_rfcomm_session_accept */ /* * Process connect() on RFCOMM session * XXX FIXME locking for "l2so"? */ static int ng_btsocket_rfcomm_session_connect(ng_btsocket_rfcomm_session_p s) { ng_btsocket_l2cap_pcb_p l2pcb = so2l2cap_pcb(s->l2so); int error; mtx_assert(&s->session_mtx, MA_OWNED); /* First check if connection has failed */ if ((error = s->l2so->so_error) != 0) { s->l2so->so_error = 0; NG_BTSOCKET_RFCOMM_ERR( "%s: Could not connect RFCOMM session, error=%d, state=%d, flags=%#x\n", __func__, error, s->state, s->flags); return (error); } /* Is connection still in progress? */ if (s->l2so->so_state & SS_ISCONNECTING) return (0); /* * If we got here then we are connected. Send SABM on DLCI 0 to * open multiplexor channel. */ if (error == 0) { s->state = NG_BTSOCKET_RFCOMM_SESSION_CONNECTED; /* * Adjust MTU on outgoing connection. Reserve 5 bytes: RFCOMM * frame header, one extra byte for length and one extra byte * for credits. */ s->mtu = min(l2pcb->imtu, l2pcb->omtu) - sizeof(struct rfcomm_frame_hdr) - 1 - 1; error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_SABM,0); if (error == 0) error = ng_btsocket_rfcomm_task_wakeup(); } return (error); }/* ng_btsocket_rfcomm_session_connect */ /* * Receive data on RFCOMM session * XXX FIXME locking for "l2so"? */ static int ng_btsocket_rfcomm_session_receive(ng_btsocket_rfcomm_session_p s) { struct mbuf *m = NULL; struct uio uio; int more, flags, error; mtx_assert(&s->session_mtx, MA_OWNED); /* Can we read from the L2CAP socket? */ if (!soreadable(s->l2so)) return (0); /* First check for error on L2CAP socket */ if ((error = s->l2so->so_error) != 0) { s->l2so->so_error = 0; NG_BTSOCKET_RFCOMM_ERR( "%s: Could not receive data from L2CAP socket, error=%d, state=%d, flags=%#x\n", __func__, error, s->state, s->flags); return (error); } /* * Read all packets from the L2CAP socket. * XXX FIXME/VERIFY is that correct? For now use m->m_nextpkt as * indication that there is more packets on the socket's buffer. * Also what should we use in uio.uio_resid? * May be s->mtu + sizeof(struct rfcomm_frame_hdr) + 1 + 1? */ for (more = 1; more; ) { /* Try to get next packet from socket */ bzero(&uio, sizeof(uio)); /* uio.uio_td = NULL; */ uio.uio_resid = 1000000000; flags = MSG_DONTWAIT; m = NULL; error = soreceive(s->l2so, NULL, &uio, &m, (struct mbuf **) NULL, &flags); if (error != 0) { if (error == EWOULDBLOCK) return (0); /* XXX can happen? */ NG_BTSOCKET_RFCOMM_ERR( "%s: Could not receive data from L2CAP socket, error=%d\n", __func__, error); return (error); } more = (m->m_nextpkt != NULL); m->m_nextpkt = NULL; ng_btsocket_rfcomm_receive_frame(s, m); } return (0); } /* ng_btsocket_rfcomm_session_receive */ /* * Send data on RFCOMM session * XXX FIXME locking for "l2so"? */ static int ng_btsocket_rfcomm_session_send(ng_btsocket_rfcomm_session_p s) { struct mbuf *m = NULL; int error; mtx_assert(&s->session_mtx, MA_OWNED); /* Send as much as we can from the session queue */ while (sowriteable(s->l2so)) { /* Check if socket still OK */ if ((error = s->l2so->so_error) != 0) { s->l2so->so_error = 0; NG_BTSOCKET_RFCOMM_ERR( "%s: Detected error=%d on L2CAP socket, state=%d, flags=%#x\n", __func__, error, s->state, s->flags); return (error); } NG_BT_MBUFQ_DEQUEUE(&s->outq, m); if (m == NULL) return (0); /* we are done */ /* Call send function on the L2CAP socket */ error = (*s->l2so->so_proto->pr_usrreqs->pru_send)(s->l2so, 0, m, NULL, NULL, curthread /* XXX */); if (error != 0) { NG_BTSOCKET_RFCOMM_ERR( "%s: Could not send data to L2CAP socket, error=%d\n", __func__, error); return (error); } } return (0); } /* ng_btsocket_rfcomm_session_send */ /* * Close and disconnect all DLCs for the given session. Caller must hold * s->sesson_mtx. Will wakeup session. */ static void ng_btsocket_rfcomm_session_clean(ng_btsocket_rfcomm_session_p s) { ng_btsocket_rfcomm_pcb_p pcb = NULL, pcb_next = NULL; int error; mtx_assert(&s->session_mtx, MA_OWNED); /* * Note: cannot use LIST_FOREACH because ng_btsocket_rfcomm_pcb_kill * will unlink DLC from the session */ for (pcb = LIST_FIRST(&s->dlcs); pcb != NULL; ) { mtx_lock(&pcb->pcb_mtx); pcb_next = LIST_NEXT(pcb, session_next); NG_BTSOCKET_RFCOMM_INFO( "%s: Disconnecting dlci=%d, state=%d, flags=%#x\n", __func__, pcb->dlci, pcb->state, pcb->flags); if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONNECTED) error = ECONNRESET; else error = ECONNREFUSED; ng_btsocket_rfcomm_pcb_kill(pcb, error); mtx_unlock(&pcb->pcb_mtx); pcb = pcb_next; } } /* ng_btsocket_rfcomm_session_clean */ /* * Process all DLCs on the session. Caller MUST hold s->session_mtx. */ static void ng_btsocket_rfcomm_session_process_pcb(ng_btsocket_rfcomm_session_p s) { ng_btsocket_rfcomm_pcb_p pcb = NULL, pcb_next = NULL; int error; mtx_assert(&s->session_mtx, MA_OWNED); /* * Note: cannot use LIST_FOREACH because ng_btsocket_rfcomm_pcb_kill * will unlink DLC from the session */ for (pcb = LIST_FIRST(&s->dlcs); pcb != NULL; ) { mtx_lock(&pcb->pcb_mtx); pcb_next = LIST_NEXT(pcb, session_next); switch (pcb->state) { /* * If DLC in W4_CONNECT state then we should check for both * timeout and detach. */ case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT: if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_DETACHED) ng_btsocket_rfcomm_pcb_kill(pcb, 0); else if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT) ng_btsocket_rfcomm_pcb_kill(pcb, ETIMEDOUT); break; /* * If DLC in CONFIGURING or CONNECTING state then we only * should check for timeout. If detach() was called then * DLC will be moved into DISCONNECTING state. */ case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING: case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT) ng_btsocket_rfcomm_pcb_kill(pcb, ETIMEDOUT); break; /* * If DLC in CONNECTED state then we need to send data (if any) * from the socket's send queue. Note that we will send data * from either all sockets or none. This may overload session's * outgoing queue (but we do not check for that). * * XXX FIXME need scheduler for RFCOMM sockets */ case NG_BTSOCKET_RFCOMM_DLC_CONNECTED: error = ng_btsocket_rfcomm_pcb_send(pcb, ALOT); if (error != 0) ng_btsocket_rfcomm_pcb_kill(pcb, error); break; /* * If DLC in DISCONNECTING state then we must send DISC frame. * Note that if DLC has timeout set then we do not need to * resend DISC frame. * * XXX FIXME need to drain all data from the socket's queue * if LINGER option was set */ case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING: if (!(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO)) { error = ng_btsocket_rfcomm_send_command( pcb->session, RFCOMM_FRAME_DISC, pcb->dlci); if (error == 0) ng_btsocket_rfcomm_timeout(pcb); else ng_btsocket_rfcomm_pcb_kill(pcb, error); } else if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT) ng_btsocket_rfcomm_pcb_kill(pcb, ETIMEDOUT); break; /* case NG_BTSOCKET_RFCOMM_DLC_CLOSED: */ default: panic("%s: Invalid DLC state=%d, flags=%#x\n", __func__, pcb->state, pcb->flags); break; } mtx_unlock(&pcb->pcb_mtx); pcb = pcb_next; } } /* ng_btsocket_rfcomm_session_process_pcb */ /* * Find RFCOMM session between "src" and "dst". * Caller MUST hold ng_btsocket_rfcomm_sessions_mtx. */ static ng_btsocket_rfcomm_session_p ng_btsocket_rfcomm_session_by_addr(bdaddr_p src, bdaddr_p dst) { ng_btsocket_rfcomm_session_p s = NULL; ng_btsocket_l2cap_pcb_p l2pcb = NULL; int any_src; mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED); any_src = (bcmp(src, NG_HCI_BDADDR_ANY, sizeof(*src)) == 0); LIST_FOREACH(s, &ng_btsocket_rfcomm_sessions, next) { l2pcb = so2l2cap_pcb(s->l2so); if ((any_src || bcmp(&l2pcb->src, src, sizeof(*src)) == 0) && bcmp(&l2pcb->dst, dst, sizeof(*dst)) == 0) break; } return (s); } /* ng_btsocket_rfcomm_session_by_addr */ /***************************************************************************** ***************************************************************************** ** RFCOMM ***************************************************************************** *****************************************************************************/ /* * Process incoming RFCOMM frame. Caller must hold s->session_mtx. * XXX FIXME check frame length */ static int ng_btsocket_rfcomm_receive_frame(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_frame_hdr *hdr = NULL; struct mbuf *m = NULL; u_int16_t length; u_int8_t dlci, type; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); /* Pullup as much as we can into first mbuf (for direct access) */ length = min(m0->m_pkthdr.len, MHLEN); if (m0->m_len < length) { if ((m0 = m_pullup(m0, length)) == NULL) { NG_BTSOCKET_RFCOMM_ALERT( "%s: m_pullup(%d) failed\n", __func__, length); return (ENOBUFS); } } hdr = mtod(m0, struct rfcomm_frame_hdr *); dlci = RFCOMM_DLCI(hdr->address); type = RFCOMM_TYPE(hdr->control); /* Test EA bit in length. If not set then we have 2 bytes of length */ if (!RFCOMM_EA(hdr->length)) { bcopy(&hdr->length, &length, sizeof(length)); length = le16toh(length) >> 1; m_adj(m0, sizeof(*hdr) + 1); } else { length = hdr->length >> 1; m_adj(m0, sizeof(*hdr)); } NG_BTSOCKET_RFCOMM_INFO( "%s: Got frame type=%#x, dlci=%d, length=%d, cr=%d, pf=%d, len=%d\n", __func__, type, dlci, length, RFCOMM_CR(hdr->address), RFCOMM_PF(hdr->control), m0->m_pkthdr.len); /* * Get FCS (the last byte in the frame) * XXX this will not work if mbuf chain ends with empty mbuf. * XXX let's hope it never happens :) */ for (m = m0; m->m_next != NULL; m = m->m_next) ; if (m->m_len <= 0) panic("%s: Empty mbuf at the end of the chain, len=%d\n", __func__, m->m_len); /* * Check FCS. We only need to calculate FCS on first 2 or 3 bytes * and already m_pullup'ed mbuf chain, so it should be safe. */ if (ng_btsocket_rfcomm_check_fcs((u_int8_t *) hdr, type, m->m_data[m->m_len - 1])) { NG_BTSOCKET_RFCOMM_ERR( "%s: Invalid RFCOMM packet. Bad checksum\n", __func__); NG_FREE_M(m0); return (EINVAL); } m_adj(m0, -1); /* Trim FCS byte */ /* * Process RFCOMM frame. * * From TS 07.10 spec * * "... In the case where a SABM or DISC command with the P bit set * to 0 is received then the received frame shall be discarded..." * * "... If a unsolicited DM response is received then the frame shall * be processed irrespective of the P/F setting... " * * "... The station may transmit response frames with the F bit set * to 0 at any opportunity on an asynchronous basis. However, in the * case where a UA response is received with the F bit set to 0 then * the received frame shall be discarded..." * * From Bluetooth spec * * "... When credit based flow control is being used, the meaning of * the P/F bit in the control field of the RFCOMM header is redefined * for UIH frames..." */ switch (type) { case RFCOMM_FRAME_SABM: if (RFCOMM_PF(hdr->control)) error = ng_btsocket_rfcomm_receive_sabm(s, dlci); break; case RFCOMM_FRAME_DISC: if (RFCOMM_PF(hdr->control)) error = ng_btsocket_rfcomm_receive_disc(s, dlci); break; case RFCOMM_FRAME_UA: if (RFCOMM_PF(hdr->control)) error = ng_btsocket_rfcomm_receive_ua(s, dlci); break; case RFCOMM_FRAME_DM: error = ng_btsocket_rfcomm_receive_dm(s, dlci); break; case RFCOMM_FRAME_UIH: if (dlci == 0) error = ng_btsocket_rfcomm_receive_mcc(s, m0); else error = ng_btsocket_rfcomm_receive_uih(s, dlci, RFCOMM_PF(hdr->control), m0); return (error); /* NOT REACHED */ default: NG_BTSOCKET_RFCOMM_ERR( "%s: Invalid RFCOMM packet. Unknown type=%#x\n", __func__, type); error = EINVAL; break; } NG_FREE_M(m0); return (error); } /* ng_btsocket_rfcomm_receive_frame */ /* * Process RFCOMM SABM frame */ static int ng_btsocket_rfcomm_receive_sabm(ng_btsocket_rfcomm_session_p s, int dlci) { ng_btsocket_rfcomm_pcb_p pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got SABM, session state=%d, flags=%#x, mtu=%d, dlci=%d\n", __func__, s->state, s->flags, s->mtu, dlci); /* DLCI == 0 means open multiplexor channel */ if (dlci == 0) { switch (s->state) { case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED: case NG_BTSOCKET_RFCOMM_SESSION_OPEN: error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_UA, dlci); if (error == 0) { s->state = NG_BTSOCKET_RFCOMM_SESSION_OPEN; ng_btsocket_rfcomm_connect_cfm(s); } else { s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; ng_btsocket_rfcomm_session_clean(s); } break; default: NG_BTSOCKET_RFCOMM_WARN( "%s: Got SABM for session in invalid state state=%d, flags=%#x\n", __func__, s->state, s->flags); error = EINVAL; break; } return (error); } /* Make sure multiplexor channel is open */ if (s->state != NG_BTSOCKET_RFCOMM_SESSION_OPEN) { NG_BTSOCKET_RFCOMM_ERR( "%s: Got SABM for dlci=%d with mulitplexor channel closed, state=%d, " \ "flags=%#x\n", __func__, dlci, s->state, s->flags); return (EINVAL); } /* * Check if we have this DLCI. This might happen when remote * peer uses PN command before actual open (SABM) happens. */ pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTING) { NG_BTSOCKET_RFCOMM_ERR( "%s: Got SABM for dlci=%d in invalid state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); mtx_unlock(&pcb->pcb_mtx); return (ENOENT); } ng_btsocket_rfcomm_untimeout(pcb); error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_UA,dlci); if (error == 0) error = ng_btsocket_rfcomm_send_msc(pcb); if (error == 0) { pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTED; soisconnected(pcb->so); } else ng_btsocket_rfcomm_pcb_kill(pcb, error); mtx_unlock(&pcb->pcb_mtx); return (error); } /* * We do not have requested DLCI, so it must be an incoming connection * with default parameters. Try to accept it. */ pcb = ng_btsocket_rfcomm_connect_ind(s, RFCOMM_SRVCHANNEL(dlci)); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); pcb->dlci = dlci; error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_UA,dlci); if (error == 0) error = ng_btsocket_rfcomm_send_msc(pcb); if (error == 0) { pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTED; soisconnected(pcb->so); } else ng_btsocket_rfcomm_pcb_kill(pcb, error); mtx_unlock(&pcb->pcb_mtx); } else /* Nobody is listen()ing on the requested DLCI */ error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_DM,dlci); return (error); } /* ng_btsocket_rfcomm_receive_sabm */ /* * Process RFCOMM DISC frame */ static int ng_btsocket_rfcomm_receive_disc(ng_btsocket_rfcomm_session_p s, int dlci) { ng_btsocket_rfcomm_pcb_p pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got DISC, session state=%d, flags=%#x, mtu=%d, dlci=%d\n", __func__, s->state, s->flags, s->mtu, dlci); /* DLCI == 0 means close multiplexor channel */ if (dlci == 0) { /* XXX FIXME assume that remote side will close the socket */ error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_UA, 0); if (error == 0) { if (s->state == NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING) s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; /* XXX */ else s->state = NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING; } else s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; /* XXX */ ng_btsocket_rfcomm_session_clean(s); } else { pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci); if (pcb != NULL) { int err; mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_RFCOMM_INFO( "%s: Got DISC for dlci=%d, state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_UA, dlci); if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONNECTED) err = 0; else err = ECONNREFUSED; ng_btsocket_rfcomm_pcb_kill(pcb, err); mtx_unlock(&pcb->pcb_mtx); } else { NG_BTSOCKET_RFCOMM_WARN( "%s: Got DISC for non-existing dlci=%d\n", __func__, dlci); error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_DM, dlci); } } return (error); } /* ng_btsocket_rfcomm_receive_disc */ /* * Process RFCOMM UA frame */ static int ng_btsocket_rfcomm_receive_ua(ng_btsocket_rfcomm_session_p s, int dlci) { ng_btsocket_rfcomm_pcb_p pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got UA, session state=%d, flags=%#x, mtu=%d, dlci=%d\n", __func__, s->state, s->flags, s->mtu, dlci); /* dlci == 0 means multiplexor channel */ if (dlci == 0) { switch (s->state) { case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED: s->state = NG_BTSOCKET_RFCOMM_SESSION_OPEN; ng_btsocket_rfcomm_connect_cfm(s); break; case NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING: s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; ng_btsocket_rfcomm_session_clean(s); break; default: NG_BTSOCKET_RFCOMM_WARN( "%s: Got UA for session in invalid state=%d(%d), flags=%#x, mtu=%d\n", __func__, s->state, INITIATOR(s), s->flags, s->mtu); error = ENOENT; break; } return (error); } /* Check if we have this DLCI */ pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_RFCOMM_INFO( "%s: Got UA for dlci=%d, state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); switch (pcb->state) { case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: ng_btsocket_rfcomm_untimeout(pcb); error = ng_btsocket_rfcomm_send_msc(pcb); if (error == 0) { pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTED; soisconnected(pcb->so); } break; case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING: ng_btsocket_rfcomm_pcb_kill(pcb, 0); break; default: NG_BTSOCKET_RFCOMM_WARN( "%s: Got UA for dlci=%d in invalid state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); error = ENOENT; break; } mtx_unlock(&pcb->pcb_mtx); } else { NG_BTSOCKET_RFCOMM_WARN( "%s: Got UA for non-existing dlci=%d\n", __func__, dlci); error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_DM,dlci); } return (error); } /* ng_btsocket_rfcomm_receive_ua */ /* * Process RFCOMM DM frame */ static int ng_btsocket_rfcomm_receive_dm(ng_btsocket_rfcomm_session_p s, int dlci) { ng_btsocket_rfcomm_pcb_p pcb = NULL; int error; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got DM, session state=%d, flags=%#x, mtu=%d, dlci=%d\n", __func__, s->state, s->flags, s->mtu, dlci); /* DLCI == 0 means multiplexor channel */ if (dlci == 0) { /* Disconnect all dlc's on the session */ s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; ng_btsocket_rfcomm_session_clean(s); } else { pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_RFCOMM_INFO( "%s: Got DM for dlci=%d, state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONNECTED) error = ECONNRESET; else error = ECONNREFUSED; ng_btsocket_rfcomm_pcb_kill(pcb, error); mtx_unlock(&pcb->pcb_mtx); } else NG_BTSOCKET_RFCOMM_WARN( "%s: Got DM for non-existing dlci=%d\n", __func__, dlci); } return (0); } /* ng_btsocket_rfcomm_receive_dm */ /* * Process RFCOMM UIH frame (data) */ static int ng_btsocket_rfcomm_receive_uih(ng_btsocket_rfcomm_session_p s, int dlci, int pf, struct mbuf *m0) { ng_btsocket_rfcomm_pcb_p pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got UIH, session state=%d, flags=%#x, mtu=%d, dlci=%d, pf=%d, len=%d\n", __func__, s->state, s->flags, s->mtu, dlci, pf, m0->m_pkthdr.len); /* XXX should we do it here? Check for session flow control */ if (s->flags & NG_BTSOCKET_RFCOMM_SESSION_LFC) { NG_BTSOCKET_RFCOMM_WARN( "%s: Got UIH with session flow control asserted, state=%d, flags=%#x\n", __func__, s->state, s->flags); goto drop; } /* Check if we have this dlci */ pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci); if (pcb == NULL) { NG_BTSOCKET_RFCOMM_WARN( "%s: Got UIH for non-existing dlci=%d\n", __func__, dlci); error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_DM,dlci); goto drop; } mtx_lock(&pcb->pcb_mtx); /* Check dlci state */ if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTED) { NG_BTSOCKET_RFCOMM_WARN( "%s: Got UIH for dlci=%d in invalid state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); error = EINVAL; goto drop1; } /* Check dlci flow control */ if (((pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) && pcb->rx_cred <= 0) || (pcb->lmodem & RFCOMM_MODEM_FC)) { NG_BTSOCKET_RFCOMM_ERR( "%s: Got UIH for dlci=%d with asserted flow control, state=%d, " \ "flags=%#x, rx_cred=%d, lmodem=%#x\n", __func__, dlci, pcb->state, pcb->flags, pcb->rx_cred, pcb->lmodem); goto drop1; } /* Did we get any credits? */ if ((pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) && pf) { NG_BTSOCKET_RFCOMM_INFO( "%s: Got %d more credits for dlci=%d, state=%d, flags=%#x, " \ "rx_cred=%d, tx_cred=%d\n", __func__, *mtod(m0, u_int8_t *), dlci, pcb->state, pcb->flags, pcb->rx_cred, pcb->tx_cred); pcb->tx_cred += *mtod(m0, u_int8_t *); m_adj(m0, 1); /* Send more from the DLC. XXX check for errors? */ ng_btsocket_rfcomm_pcb_send(pcb, ALOT); } /* OK the of the rest of the mbuf is the data */ if (m0->m_pkthdr.len > 0) { /* If we are using credit flow control decrease rx_cred here */ if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) { /* Give remote peer more credits (if needed) */ if (-- pcb->rx_cred <= RFCOMM_MAX_CREDITS / 2) ng_btsocket_rfcomm_send_credits(pcb); else NG_BTSOCKET_RFCOMM_INFO( "%s: Remote side still has credits, dlci=%d, state=%d, flags=%#x, " \ "rx_cred=%d, tx_cred=%d\n", __func__, dlci, pcb->state, pcb->flags, pcb->rx_cred, pcb->tx_cred); } /* Check packet against mtu on dlci */ if (m0->m_pkthdr.len > pcb->mtu) { NG_BTSOCKET_RFCOMM_ERR( "%s: Got oversized UIH for dlci=%d, state=%d, flags=%#x, mtu=%d, len=%d\n", __func__, dlci, pcb->state, pcb->flags, pcb->mtu, m0->m_pkthdr.len); error = EMSGSIZE; } else if (m0->m_pkthdr.len > sbspace(&pcb->so->so_rcv)) { /* * This is really bad. Receive queue on socket does * not have enough space for the packet. We do not * have any other choice but drop the packet. */ NG_BTSOCKET_RFCOMM_ERR( "%s: Not enough space in socket receive queue. Dropping UIH for dlci=%d, " \ "state=%d, flags=%#x, len=%d, space=%ld\n", __func__, dlci, pcb->state, pcb->flags, m0->m_pkthdr.len, sbspace(&pcb->so->so_rcv)); error = ENOBUFS; } else { /* Append packet to the socket receive queue */ sbappend(&pcb->so->so_rcv, m0, 0); m0 = NULL; sorwakeup(pcb->so); } } drop1: mtx_unlock(&pcb->pcb_mtx); drop: NG_FREE_M(m0); /* checks for != NULL */ return (error); } /* ng_btsocket_rfcomm_receive_uih */ /* * Process RFCOMM MCC command (Multiplexor) * * From TS 07.10 spec * * "5.4.3.1 Information Data * * ...The frames (UIH) sent by the initiating station have the C/R bit set * to 1 and those sent by the responding station have the C/R bit set to 0..." * * "5.4.6.2 Operating procedures * * Messages always exist in pairs; a command message and a corresponding * response message. If the C/R bit is set to 1 the message is a command, * if it is set to 0 the message is a response... * * ... * * NOTE: Notice that when UIH frames are used to convey information on DLCI 0 * there are at least two different fields that contain a C/R bit, and the * bits are set of different form. The C/R bit in the Type field shall be set * as it is stated above, while the C/R bit in the Address field (see subclause * 5.2.1.2) shall be set as it is described in subclause 5.4.3.1." */ static int ng_btsocket_rfcomm_receive_mcc(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = NULL; u_int8_t cr, type, length; mtx_assert(&s->session_mtx, MA_OWNED); /* * We can access data directly in the first mbuf, because we have * m_pullup()'ed mbuf chain in ng_btsocket_rfcomm_receive_frame(). * All MCC commands should fit into single mbuf (except probably TEST). */ hdr = mtod(m0, struct rfcomm_mcc_hdr *); cr = RFCOMM_CR(hdr->type); type = RFCOMM_MCC_TYPE(hdr->type); length = RFCOMM_MCC_LENGTH(hdr->length); /* Check MCC frame length */ if (sizeof(*hdr) + length != m0->m_pkthdr.len) { NG_BTSOCKET_RFCOMM_ERR( "%s: Invalid MCC frame length=%d, len=%d\n", __func__, length, m0->m_pkthdr.len); NG_FREE_M(m0); return (EMSGSIZE); } switch (type) { case RFCOMM_MCC_TEST: return (ng_btsocket_rfcomm_receive_test(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_FCON: case RFCOMM_MCC_FCOFF: return (ng_btsocket_rfcomm_receive_fc(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_MSC: return (ng_btsocket_rfcomm_receive_msc(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_RPN: return (ng_btsocket_rfcomm_receive_rpn(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_RLS: return (ng_btsocket_rfcomm_receive_rls(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_PN: return (ng_btsocket_rfcomm_receive_pn(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_NSC: NG_BTSOCKET_RFCOMM_ERR( "%s: Got MCC NSC, type=%#x, cr=%d, length=%d, session state=%d, flags=%#x, " \ "mtu=%d, len=%d\n", __func__, RFCOMM_MCC_TYPE(*((u_int8_t *)(hdr + 1))), cr, length, s->state, s->flags, s->mtu, m0->m_pkthdr.len); NG_FREE_M(m0); break; default: NG_BTSOCKET_RFCOMM_ERR( "%s: Got unknown MCC, type=%#x, cr=%d, length=%d, session state=%d, " \ "flags=%#x, mtu=%d, len=%d\n", __func__, type, cr, length, s->state, s->flags, s->mtu, m0->m_pkthdr.len); /* Reuse mbuf to send NSC */ hdr = mtod(m0, struct rfcomm_mcc_hdr *); m0->m_pkthdr.len = m0->m_len = sizeof(*hdr); /* Create MCC NSC header */ hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_NSC); hdr->length = RFCOMM_MKLEN8(1); /* Put back MCC command type we did not like */ m0->m_data[m0->m_len] = RFCOMM_MKMCC_TYPE(cr, type); m0->m_pkthdr.len ++; m0->m_len ++; /* Send UIH frame */ return (ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0)); /* NOT REACHED */ } return (0); } /* ng_btsocket_rfcomm_receive_mcc */ /* * Receive RFCOMM TEST MCC command */ static int ng_btsocket_rfcomm_receive_test(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr *); int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC TEST, cr=%d, length=%d, session state=%d, flags=%#x, mtu=%d, " \ "len=%d\n", __func__, RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (RFCOMM_CR(hdr->type)) { hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_TEST); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); } else NG_FREE_M(m0); /* XXX ignore response */ return (error); } /* ng_btsocket_rfcomm_receive_test */ /* * Receive RFCOMM FCON/FCOFF MCC command */ static int ng_btsocket_rfcomm_receive_fc(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr *); u_int8_t type = RFCOMM_MCC_TYPE(hdr->type); int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); /* * Turn ON/OFF aggregate flow on the entire session. When remote peer * asserted flow control no transmission shall occur except on dlci 0 * (control channel). */ NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC FC%s, cr=%d, length=%d, session state=%d, flags=%#x, mtu=%d, " \ "len=%d\n", __func__, (type == RFCOMM_MCC_FCON)? "ON" : "OFF", RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (RFCOMM_CR(hdr->type)) { if (type == RFCOMM_MCC_FCON) s->flags &= ~NG_BTSOCKET_RFCOMM_SESSION_RFC; else s->flags |= NG_BTSOCKET_RFCOMM_SESSION_RFC; hdr->type = RFCOMM_MKMCC_TYPE(0, type); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); } else NG_FREE_M(m0); /* XXX ignore response */ return (error); } /* ng_btsocket_rfcomm_receive_fc */ /* * Receive RFCOMM MSC MCC command */ static int ng_btsocket_rfcomm_receive_msc(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr*); struct rfcomm_mcc_msc *msc = (struct rfcomm_mcc_msc *)(hdr+1); ng_btsocket_rfcomm_pcb_t *pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC MSC, dlci=%d, cr=%d, length=%d, session state=%d, flags=%#x, " \ "mtu=%d, len=%d\n", __func__, RFCOMM_DLCI(msc->address), RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (RFCOMM_CR(hdr->type)) { pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, RFCOMM_DLCI(msc->address)); if (pcb == NULL) { NG_BTSOCKET_RFCOMM_WARN( "%s: Got MSC command for non-existing dlci=%d\n", __func__, RFCOMM_DLCI(msc->address)); NG_FREE_M(m0); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTING && pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTED) { NG_BTSOCKET_RFCOMM_WARN( "%s: Got MSC on dlci=%d in invalid state=%d\n", __func__, RFCOMM_DLCI(msc->address), pcb->state); mtx_unlock(&pcb->pcb_mtx); NG_FREE_M(m0); return (EINVAL); } pcb->rmodem = msc->modem; /* Update remote port signals */ hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_MSC); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); #if 0 /* YYY */ /* Send more data from DLC. XXX check for errors? */ if (!(pcb->rmodem & RFCOMM_MODEM_FC) && !(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC)) ng_btsocket_rfcomm_pcb_send(pcb, ALOT); #endif /* YYY */ mtx_unlock(&pcb->pcb_mtx); } else NG_FREE_M(m0); /* XXX ignore response */ return (error); } /* ng_btsocket_rfcomm_receive_msc */ /* * Receive RFCOMM RPN MCC command * XXX FIXME do we need htole16/le16toh for RPN param_mask? */ static int ng_btsocket_rfcomm_receive_rpn(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr *); struct rfcomm_mcc_rpn *rpn = (struct rfcomm_mcc_rpn *)(hdr + 1); int error = 0; u_int16_t param_mask; u_int8_t bit_rate, data_bits, stop_bits, parity, flow_control, xon_char, xoff_char; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC RPN, dlci=%d, cr=%d, length=%d, session state=%d, flags=%#x, " \ "mtu=%d, len=%d\n", __func__, RFCOMM_DLCI(rpn->dlci), RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (RFCOMM_CR(hdr->type)) { param_mask = RFCOMM_RPN_PM_ALL; if (RFCOMM_MCC_LENGTH(hdr->length) == 1) { /* Request - return default setting */ bit_rate = RFCOMM_RPN_BR_115200; data_bits = RFCOMM_RPN_DATA_8; stop_bits = RFCOMM_RPN_STOP_1; parity = RFCOMM_RPN_PARITY_NONE; flow_control = RFCOMM_RPN_FLOW_NONE; xon_char = RFCOMM_RPN_XON_CHAR; xoff_char = RFCOMM_RPN_XOFF_CHAR; } else { /* * Ignore/accept bit_rate, 8 bits, 1 stop bit, no * parity, no flow control lines, default XON/XOFF * chars. */ bit_rate = rpn->bit_rate; rpn->param_mask = le16toh(rpn->param_mask); /* XXX */ data_bits = RFCOMM_RPN_DATA_BITS(rpn->line_settings); if (rpn->param_mask & RFCOMM_RPN_PM_DATA && data_bits != RFCOMM_RPN_DATA_8) { data_bits = RFCOMM_RPN_DATA_8; param_mask ^= RFCOMM_RPN_PM_DATA; } stop_bits = RFCOMM_RPN_STOP_BITS(rpn->line_settings); if (rpn->param_mask & RFCOMM_RPN_PM_STOP && stop_bits != RFCOMM_RPN_STOP_1) { stop_bits = RFCOMM_RPN_STOP_1; param_mask ^= RFCOMM_RPN_PM_STOP; } parity = RFCOMM_RPN_PARITY(rpn->line_settings); if (rpn->param_mask & RFCOMM_RPN_PM_PARITY && parity != RFCOMM_RPN_PARITY_NONE) { parity = RFCOMM_RPN_PARITY_NONE; param_mask ^= RFCOMM_RPN_PM_PARITY; } flow_control = rpn->flow_control; if (rpn->param_mask & RFCOMM_RPN_PM_FLOW && flow_control != RFCOMM_RPN_FLOW_NONE) { flow_control = RFCOMM_RPN_FLOW_NONE; param_mask ^= RFCOMM_RPN_PM_FLOW; } xon_char = rpn->xon_char; if (rpn->param_mask & RFCOMM_RPN_PM_XON && xon_char != RFCOMM_RPN_XON_CHAR) { xon_char = RFCOMM_RPN_XON_CHAR; param_mask ^= RFCOMM_RPN_PM_XON; } xoff_char = rpn->xoff_char; if (rpn->param_mask & RFCOMM_RPN_PM_XOFF && xoff_char != RFCOMM_RPN_XOFF_CHAR) { xoff_char = RFCOMM_RPN_XOFF_CHAR; param_mask ^= RFCOMM_RPN_PM_XOFF; } } rpn->bit_rate = bit_rate; rpn->line_settings = RFCOMM_MKRPN_LINE_SETTINGS(data_bits, stop_bits, parity); rpn->flow_control = flow_control; rpn->xon_char = xon_char; rpn->xoff_char = xoff_char; rpn->param_mask = htole16(param_mask); /* XXX */ m0->m_pkthdr.len = m0->m_len = sizeof(*hdr) + sizeof(*rpn); hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_RPN); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); } else NG_FREE_M(m0); /* XXX ignore response */ return (error); } /* ng_btsocket_rfcomm_receive_rpn */ /* * Receive RFCOMM RLS MCC command */ static int ng_btsocket_rfcomm_receive_rls(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr *); struct rfcomm_mcc_rls *rls = (struct rfcomm_mcc_rls *)(hdr + 1); int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); /* * XXX FIXME Do we have to do anything else here? Remote peer tries to * tell us something about DLCI. Just report what we have received and * return back received values as required by TS 07.10 spec. */ NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC RLS, dlci=%d, status=%#x, cr=%d, length=%d, session state=%d, " \ "flags=%#x, mtu=%d, len=%d\n", __func__, RFCOMM_DLCI(rls->address), rls->status, RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (RFCOMM_CR(hdr->type)) { if (rls->status & 0x1) NG_BTSOCKET_RFCOMM_ERR( "%s: Got RLS dlci=%d, error=%#x\n", __func__, RFCOMM_DLCI(rls->address), rls->status >> 1); hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_RLS); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); } else NG_FREE_M(m0); /* XXX ignore responses */ return (error); } /* ng_btsocket_rfcomm_receive_rls */ /* * Receive RFCOMM PN MCC command */ static int ng_btsocket_rfcomm_receive_pn(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr*); struct rfcomm_mcc_pn *pn = (struct rfcomm_mcc_pn *)(hdr+1); ng_btsocket_rfcomm_pcb_t *pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC PN, dlci=%d, cr=%d, length=%d, flow_control=%#x, priority=%d, " \ "ack_timer=%d, mtu=%d, max_retrans=%d, credits=%d, session state=%d, " \ "flags=%#x, session mtu=%d, len=%d\n", __func__, pn->dlci, RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), pn->flow_control, pn->priority, pn->ack_timer, le16toh(pn->mtu), pn->max_retrans, pn->credits, s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (pn->dlci == 0) { NG_BTSOCKET_RFCOMM_ERR("%s: Zero dlci in MCC PN\n", __func__); NG_FREE_M(m0); return (EINVAL); } /* Check if we have this dlci */ pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, pn->dlci); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); if (RFCOMM_CR(hdr->type)) { /* PN Request */ ng_btsocket_rfcomm_set_pn(pcb, 1, pn->flow_control, pn->credits, pn->mtu); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) { pn->flow_control = 0xe0; pn->credits = RFCOMM_DEFAULT_CREDITS; } else { pn->flow_control = 0; pn->credits = 0; } hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_PN); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); } else { /* PN Response - proceed with SABM. Timeout still set */ if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONFIGURING) { ng_btsocket_rfcomm_set_pn(pcb, 0, pn->flow_control, pn->credits, pn->mtu); pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTING; error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_SABM, pn->dlci); } else NG_BTSOCKET_RFCOMM_WARN( "%s: Got PN response for dlci=%d in invalid state=%d\n", __func__, pn->dlci, pcb->state); NG_FREE_M(m0); } mtx_unlock(&pcb->pcb_mtx); } else if (RFCOMM_CR(hdr->type)) { /* PN request to non-existing dlci - incoming connection */ pcb = ng_btsocket_rfcomm_connect_ind(s, RFCOMM_SRVCHANNEL(pn->dlci)); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); pcb->dlci = pn->dlci; ng_btsocket_rfcomm_set_pn(pcb, 1, pn->flow_control, pn->credits, pn->mtu); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) { pn->flow_control = 0xe0; pn->credits = RFCOMM_DEFAULT_CREDITS; } else { pn->flow_control = 0; pn->credits = 0; } hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_PN); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); if (error == 0) { ng_btsocket_rfcomm_timeout(pcb); pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTING; soisconnecting(pcb->so); } else ng_btsocket_rfcomm_pcb_kill(pcb, error); mtx_unlock(&pcb->pcb_mtx); } else { /* Nobody is listen()ing on this channel */ error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_DM, pn->dlci); NG_FREE_M(m0); } } else NG_FREE_M(m0); /* XXX ignore response to non-existing dlci */ return (error); } /* ng_btsocket_rfcomm_receive_pn */ /* * Set PN parameters for dlci. Caller must hold pcb->pcb_mtx. * * From Bluetooth spec. * * "... The CL1 - CL4 field is completely redefined. (In TS07.10 this defines * the convergence layer to use, which is not applicable to RFCOMM. In RFCOMM, * in Bluetooth versions up to 1.0B, this field was forced to 0). * * In the PN request sent prior to a DLC establishment, this field must contain * the value 15 (0xF), indicating support of credit based flow control in the * sender. See Table 5.3 below. If the PN response contains any other value * than 14 (0xE) in this field, it is inferred that the peer RFCOMM entity is * not supporting the credit based flow control feature. (This is only possible * if the peer RFCOMM implementation is only conforming to Bluetooth version * 1.0B.) If a PN request is sent on an already open DLC, then this field must * contain the value zero; it is not possible to set initial credits more * than once per DLC activation. A responding implementation must set this * field in the PN response to 14 (0xE), if (and only if) the value in the PN * request was 15..." */ static void ng_btsocket_rfcomm_set_pn(ng_btsocket_rfcomm_pcb_p pcb, u_int8_t cr, u_int8_t flow_control, u_int8_t credits, u_int16_t mtu) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); pcb->mtu = le16toh(mtu); if (cr) { if (flow_control == 0xf0) { pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_CFC; pcb->tx_cred = credits; } else { pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_CFC; pcb->tx_cred = 0; } } else { if (flow_control == 0xe0) { pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_CFC; pcb->tx_cred = credits; } else { pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_CFC; pcb->tx_cred = 0; } } NG_BTSOCKET_RFCOMM_INFO( "%s: cr=%d, dlci=%d, state=%d, flags=%#x, mtu=%d, rx_cred=%d, tx_cred=%d\n", __func__, cr, pcb->dlci, pcb->state, pcb->flags, pcb->mtu, pcb->rx_cred, pcb->tx_cred); } /* ng_btsocket_rfcomm_set_pn */ /* * Send RFCOMM SABM/DISC/UA/DM frames. Caller must hold s->session_mtx */ static int ng_btsocket_rfcomm_send_command(ng_btsocket_rfcomm_session_p s, u_int8_t type, u_int8_t dlci) { struct rfcomm_cmd_hdr *hdr = NULL; struct mbuf *m = NULL; int cr; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Sending command type %#x, session state=%d, flags=%#x, mtu=%d, dlci=%d\n", __func__, type, s->state, s->flags, s->mtu, dlci); switch (type) { case RFCOMM_FRAME_SABM: case RFCOMM_FRAME_DISC: cr = INITIATOR(s); break; case RFCOMM_FRAME_UA: case RFCOMM_FRAME_DM: cr = !INITIATOR(s); break; default: panic("%s: Invalid frame type=%#x\n", __func__, type); return (EINVAL); /* NOT REACHED */ } MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_pkthdr.len = m->m_len = sizeof(*hdr); hdr = mtod(m, struct rfcomm_cmd_hdr *); hdr->address = RFCOMM_MKADDRESS(cr, dlci); hdr->control = RFCOMM_MKCONTROL(type, 1); hdr->length = RFCOMM_MKLEN8(0); hdr->fcs = ng_btsocket_rfcomm_fcs3((u_int8_t *) hdr); NG_BT_MBUFQ_ENQUEUE(&s->outq, m); return (0); } /* ng_btsocket_rfcomm_send_command */ /* * Send RFCOMM UIH frame. Caller must hold s->session_mtx */ static int ng_btsocket_rfcomm_send_uih(ng_btsocket_rfcomm_session_p s, u_int8_t address, u_int8_t pf, u_int8_t credits, struct mbuf *data) { struct rfcomm_frame_hdr *hdr = NULL; struct mbuf *m = NULL, *mcrc = NULL; u_int16_t length; mtx_assert(&s->session_mtx, MA_OWNED); MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) { NG_FREE_M(data); return (ENOBUFS); } m->m_pkthdr.len = m->m_len = sizeof(*hdr); MGET(mcrc, M_NOWAIT, MT_DATA); if (mcrc == NULL) { NG_FREE_M(data); return (ENOBUFS); } mcrc->m_len = 1; /* Fill UIH frame header */ hdr = mtod(m, struct rfcomm_frame_hdr *); hdr->address = address; hdr->control = RFCOMM_MKCONTROL(RFCOMM_FRAME_UIH, pf); /* Calculate FCS */ mcrc->m_data[0] = ng_btsocket_rfcomm_fcs2((u_int8_t *) hdr); /* Put length back */ length = (data != NULL)? data->m_pkthdr.len : 0; if (length > 127) { u_int16_t l = htole16(RFCOMM_MKLEN16(length)); bcopy(&l, &hdr->length, sizeof(l)); m->m_pkthdr.len ++; m->m_len ++; } else hdr->length = RFCOMM_MKLEN8(length); if (pf) { m->m_data[m->m_len] = credits; m->m_pkthdr.len ++; m->m_len ++; } /* Add payload */ if (data != NULL) { m_cat(m, data); m->m_pkthdr.len += length; } /* Put FCS back */ m_cat(m, mcrc); m->m_pkthdr.len ++; NG_BTSOCKET_RFCOMM_INFO( "%s: Sending UIH state=%d, flags=%#x, address=%d, length=%d, pf=%d, " \ "credits=%d, len=%d\n", __func__, s->state, s->flags, address, length, pf, credits, m->m_pkthdr.len); NG_BT_MBUFQ_ENQUEUE(&s->outq, m); return (0); } /* ng_btsocket_rfcomm_send_uih */ /* * Send MSC request. Caller must hold pcb->pcb_mtx and pcb->session->session_mtx */ static int ng_btsocket_rfcomm_send_msc(ng_btsocket_rfcomm_pcb_p pcb) { struct mbuf *m = NULL; struct rfcomm_mcc_hdr *hdr = NULL; struct rfcomm_mcc_msc *msc = NULL; mtx_assert(&pcb->session->session_mtx, MA_OWNED); mtx_assert(&pcb->pcb_mtx, MA_OWNED); MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_pkthdr.len = m->m_len = sizeof(*hdr) + sizeof(*msc); hdr = mtod(m, struct rfcomm_mcc_hdr *); msc = (struct rfcomm_mcc_msc *)(hdr + 1); hdr->type = RFCOMM_MKMCC_TYPE(1, RFCOMM_MCC_MSC); hdr->length = RFCOMM_MKLEN8(sizeof(*msc)); msc->address = RFCOMM_MKADDRESS(1, pcb->dlci); msc->modem = pcb->lmodem; NG_BTSOCKET_RFCOMM_INFO( "%s: Sending MSC dlci=%d, state=%d, flags=%#x, address=%d, modem=%#x\n", __func__, pcb->dlci, pcb->state, pcb->flags, msc->address, msc->modem); return (ng_btsocket_rfcomm_send_uih(pcb->session, RFCOMM_MKADDRESS(INITIATOR(pcb->session), 0), 0, 0, m)); } /* ng_btsocket_rfcomm_send_msc */ /* * Send PN request. Caller must hold pcb->pcb_mtx and pcb->session->session_mtx */ static int ng_btsocket_rfcomm_send_pn(ng_btsocket_rfcomm_pcb_p pcb) { struct mbuf *m = NULL; struct rfcomm_mcc_hdr *hdr = NULL; struct rfcomm_mcc_pn *pn = NULL; mtx_assert(&pcb->session->session_mtx, MA_OWNED); mtx_assert(&pcb->pcb_mtx, MA_OWNED); MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_pkthdr.len = m->m_len = sizeof(*hdr) + sizeof(*pn); hdr = mtod(m, struct rfcomm_mcc_hdr *); pn = (struct rfcomm_mcc_pn *)(hdr + 1); hdr->type = RFCOMM_MKMCC_TYPE(1, RFCOMM_MCC_PN); hdr->length = RFCOMM_MKLEN8(sizeof(*pn)); pn->dlci = pcb->dlci; /* * Set default DLCI priority as described in GSM 07.10 * (ETSI TS 101 369) clause 5.6 page 42 */ pn->priority = (pcb->dlci < 56)? (((pcb->dlci >> 3) << 3) + 7) : 61; pn->ack_timer = 0; pn->mtu = htole16(pcb->mtu); pn->max_retrans = 0; if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) { pn->flow_control = 0xf0; pn->credits = pcb->rx_cred; } else { pn->flow_control = 0; pn->credits = 0; } NG_BTSOCKET_RFCOMM_INFO( "%s: Sending PN dlci=%d, state=%d, flags=%#x, mtu=%d, flow_control=%#x, " \ "credits=%d\n", __func__, pcb->dlci, pcb->state, pcb->flags, pcb->mtu, pn->flow_control, pn->credits); return (ng_btsocket_rfcomm_send_uih(pcb->session, RFCOMM_MKADDRESS(INITIATOR(pcb->session), 0), 0, 0, m)); } /* ng_btsocket_rfcomm_send_pn */ /* * Calculate and send credits based on available space in receive buffer */ static int ng_btsocket_rfcomm_send_credits(ng_btsocket_rfcomm_pcb_p pcb) { int error = 0; u_int8_t credits; mtx_assert(&pcb->pcb_mtx, MA_OWNED); mtx_assert(&pcb->session->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Sending more credits, dlci=%d, state=%d, flags=%#x, mtu=%d, " \ "space=%ld, tx_cred=%d, rx_cred=%d\n", __func__, pcb->dlci, pcb->state, pcb->flags, pcb->mtu, sbspace(&pcb->so->so_rcv), pcb->tx_cred, pcb->rx_cred); credits = sbspace(&pcb->so->so_rcv) / pcb->mtu; if (credits > 0) { if (pcb->rx_cred + credits > RFCOMM_MAX_CREDITS) credits = RFCOMM_MAX_CREDITS - pcb->rx_cred; error = ng_btsocket_rfcomm_send_uih( pcb->session, RFCOMM_MKADDRESS(INITIATOR(pcb->session), pcb->dlci), 1, credits, NULL); if (error == 0) { pcb->rx_cred += credits; NG_BTSOCKET_RFCOMM_INFO( "%s: Gave remote side %d more credits, dlci=%d, state=%d, flags=%#x, " \ "rx_cred=%d, tx_cred=%d\n", __func__, credits, pcb->dlci, pcb->state, pcb->flags, pcb->rx_cred, pcb->tx_cred); } else NG_BTSOCKET_RFCOMM_ERR( "%s: Could not send credits, error=%d, dlci=%d, state=%d, flags=%#x, " \ "mtu=%d, space=%ld, tx_cred=%d, rx_cred=%d\n", __func__, error, pcb->dlci, pcb->state, pcb->flags, pcb->mtu, sbspace(&pcb->so->so_rcv), pcb->tx_cred, pcb->rx_cred); } return (error); } /* ng_btsocket_rfcomm_send_credits */ /***************************************************************************** ***************************************************************************** ** RFCOMM DLCs ***************************************************************************** *****************************************************************************/ /* * Send data from socket send buffer * Caller must hold pcb->pcb_mtx and pcb->session->session_mtx */ static int ng_btsocket_rfcomm_pcb_send(ng_btsocket_rfcomm_pcb_p pcb, int limit) { struct mbuf *m = NULL; int sent, length, error; mtx_assert(&pcb->session->session_mtx, MA_OWNED); mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) limit = min(limit, pcb->tx_cred); else if (!(pcb->rmodem & RFCOMM_MODEM_FC)) limit = min(limit, RFCOMM_MAX_CREDITS); /* XXX ??? */ else limit = 0; if (limit == 0) { NG_BTSOCKET_RFCOMM_INFO( "%s: Could not send - remote flow control asserted, dlci=%d, flags=%#x, " \ "rmodem=%#x, tx_cred=%d\n", __func__, pcb->dlci, pcb->flags, pcb->rmodem, pcb->tx_cred); return (0); } for (error = 0, sent = 0; sent < limit; sent ++) { length = min(pcb->mtu, sbavail(&pcb->so->so_snd)); if (length == 0) break; /* Get the chunk from the socket's send buffer */ m = ng_btsocket_rfcomm_prepare_packet(&pcb->so->so_snd, length); if (m == NULL) { error = ENOBUFS; break; } sbdrop(&pcb->so->so_snd, length); error = ng_btsocket_rfcomm_send_uih(pcb->session, RFCOMM_MKADDRESS(INITIATOR(pcb->session), pcb->dlci), 0, 0, m); if (error != 0) break; } if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) pcb->tx_cred -= sent; if (error == 0 && sent > 0) { pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_SENDING; sowwakeup(pcb->so); } return (error); } /* ng_btsocket_rfcomm_pcb_send */ /* * Unlink and disconnect DLC. If ng_btsocket_rfcomm_pcb_kill() returns * non zero value than socket has no reference and has to be detached. * Caller must hold pcb->pcb_mtx and pcb->session->session_mtx */ static void ng_btsocket_rfcomm_pcb_kill(ng_btsocket_rfcomm_pcb_p pcb, int error) { ng_btsocket_rfcomm_session_p s = pcb->session; NG_BTSOCKET_RFCOMM_INFO( "%s: Killing DLC, so=%p, dlci=%d, state=%d, flags=%#x, error=%d\n", __func__, pcb->so, pcb->dlci, pcb->state, pcb->flags, error); if (pcb->session == NULL) panic("%s: DLC without session, pcb=%p, state=%d, flags=%#x\n", __func__, pcb, pcb->state, pcb->flags); mtx_assert(&pcb->session->session_mtx, MA_OWNED); mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) ng_btsocket_rfcomm_untimeout(pcb); /* Detach DLC from the session. Does not matter which state DLC in */ LIST_REMOVE(pcb, session_next); pcb->session = NULL; /* Change DLC state and wakeup all sleepers */ pcb->state = NG_BTSOCKET_RFCOMM_DLC_CLOSED; pcb->so->so_error = error; soisdisconnected(pcb->so); wakeup(&pcb->state); /* Check if we have any DLCs left on the session */ if (LIST_EMPTY(&s->dlcs) && INITIATOR(s)) { NG_BTSOCKET_RFCOMM_INFO( "%s: Disconnecting session, state=%d, flags=%#x, mtu=%d\n", __func__, s->state, s->flags, s->mtu); switch (s->state) { case NG_BTSOCKET_RFCOMM_SESSION_CLOSED: case NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING: /* * Do not have to do anything here. We can get here * when L2CAP connection was terminated or we have * received DISC on multiplexor channel */ break; case NG_BTSOCKET_RFCOMM_SESSION_OPEN: /* Send DISC on multiplexor channel */ error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_DISC, 0); if (error == 0) { s->state = NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING; break; } /* FALL THROUGH */ case NG_BTSOCKET_RFCOMM_SESSION_CONNECTING: case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED: s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; break; /* case NG_BTSOCKET_RFCOMM_SESSION_LISTENING: */ default: panic("%s: Invalid session state=%d, flags=%#x\n", __func__, s->state, s->flags); break; } ng_btsocket_rfcomm_task_wakeup(); } } /* ng_btsocket_rfcomm_pcb_kill */ /* * Look for given dlci for given RFCOMM session. Caller must hold s->session_mtx */ static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_pcb_by_dlci(ng_btsocket_rfcomm_session_p s, int dlci) { ng_btsocket_rfcomm_pcb_p pcb = NULL; mtx_assert(&s->session_mtx, MA_OWNED); LIST_FOREACH(pcb, &s->dlcs, session_next) if (pcb->dlci == dlci) break; return (pcb); } /* ng_btsocket_rfcomm_pcb_by_dlci */ /* * Look for socket that listens on given src address and given channel */ static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_pcb_listener(bdaddr_p src, int channel) { ng_btsocket_rfcomm_pcb_p pcb = NULL, pcb1 = NULL; mtx_lock(&ng_btsocket_rfcomm_sockets_mtx); LIST_FOREACH(pcb, &ng_btsocket_rfcomm_sockets, next) { if (pcb->channel != channel || !(pcb->so->so_options & SO_ACCEPTCONN)) continue; if (bcmp(&pcb->src, src, sizeof(*src)) == 0) break; if (bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) pcb1 = pcb; } mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); return ((pcb != NULL)? pcb : pcb1); } /* ng_btsocket_rfcomm_pcb_listener */ /***************************************************************************** ***************************************************************************** ** Misc. functions ***************************************************************************** *****************************************************************************/ /* * Set timeout. Caller MUST hold pcb_mtx */ static void ng_btsocket_rfcomm_timeout(ng_btsocket_rfcomm_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (!(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO)) { pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_TIMO; pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT; callout_reset(&pcb->timo, ng_btsocket_rfcomm_timo * hz, ng_btsocket_rfcomm_process_timeout, pcb); } else panic("%s: Duplicated socket timeout?!\n", __func__); } /* ng_btsocket_rfcomm_timeout */ /* * Unset pcb timeout. Caller MUST hold pcb_mtx */ static void ng_btsocket_rfcomm_untimeout(ng_btsocket_rfcomm_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) { callout_stop(&pcb->timo); pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMO; pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT; } else panic("%s: No socket timeout?!\n", __func__); } /* ng_btsocket_rfcomm_timeout */ /* * Process pcb timeout */ static void ng_btsocket_rfcomm_process_timeout(void *xpcb) { ng_btsocket_rfcomm_pcb_p pcb = (ng_btsocket_rfcomm_pcb_p) xpcb; mtx_assert(&pcb->pcb_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Timeout, so=%p, dlci=%d, state=%d, flags=%#x\n", __func__, pcb->so, pcb->dlci, pcb->state, pcb->flags); pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMO; pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT; switch (pcb->state) { case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING: case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: pcb->state = NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING; break; case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT: case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING: break; default: panic( "%s: DLC timeout in invalid state, dlci=%d, state=%d, flags=%#x\n", __func__, pcb->dlci, pcb->state, pcb->flags); break; } ng_btsocket_rfcomm_task_wakeup(); } /* ng_btsocket_rfcomm_process_timeout */ /* * Get up to length bytes from the socket buffer */ static struct mbuf * ng_btsocket_rfcomm_prepare_packet(struct sockbuf *sb, int length) { struct mbuf *top = NULL, *m = NULL, *n = NULL, *nextpkt = NULL; int mlen, noff, len; MGETHDR(top, M_NOWAIT, MT_DATA); if (top == NULL) return (NULL); top->m_pkthdr.len = length; top->m_len = 0; mlen = MHLEN; m = top; n = sb->sb_mb; nextpkt = n->m_nextpkt; noff = 0; while (length > 0 && n != NULL) { len = min(mlen - m->m_len, n->m_len - noff); if (len > length) len = length; bcopy(mtod(n, caddr_t)+noff, mtod(m, caddr_t)+m->m_len, len); m->m_len += len; noff += len; length -= len; if (length > 0 && m->m_len == mlen) { MGET(m->m_next, M_NOWAIT, MT_DATA); if (m->m_next == NULL) { NG_FREE_M(top); return (NULL); } m = m->m_next; m->m_len = 0; mlen = MLEN; } if (noff == n->m_len) { noff = 0; n = n->m_next; if (n == NULL) n = nextpkt; nextpkt = (n != NULL)? n->m_nextpkt : NULL; } } if (length < 0) panic("%s: length=%d\n", __func__, length); if (length > 0 && n == NULL) panic("%s: bogus length=%d, n=%p\n", __func__, length, n); return (top); } /* ng_btsocket_rfcomm_prepare_packet */ Index: head/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c =================================================================== --- head/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c (revision 319721) +++ head/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c (revision 319722) @@ -1,1987 +1,1980 @@ /* * ng_btsocket_sco.c */ /*- * Copyright (c) 2001-2002 Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: ng_btsocket_sco.c,v 1.2 2005/10/31 18:08:51 max Exp $ * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* MALLOC define */ #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_BTSOCKET_SCO, "netgraph_btsocks_sco", "Netgraph Bluetooth SCO sockets"); #else #define M_NETGRAPH_BTSOCKET_SCO M_NETGRAPH #endif /* NG_SEPARATE_MALLOC */ /* Netgraph node methods */ static ng_constructor_t ng_btsocket_sco_node_constructor; static ng_rcvmsg_t ng_btsocket_sco_node_rcvmsg; static ng_shutdown_t ng_btsocket_sco_node_shutdown; static ng_newhook_t ng_btsocket_sco_node_newhook; static ng_connect_t ng_btsocket_sco_node_connect; static ng_rcvdata_t ng_btsocket_sco_node_rcvdata; static ng_disconnect_t ng_btsocket_sco_node_disconnect; static void ng_btsocket_sco_input (void *, int); static void ng_btsocket_sco_rtclean (void *, int); /* Netgraph type descriptor */ static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_BTSOCKET_SCO_NODE_TYPE, .constructor = ng_btsocket_sco_node_constructor, .rcvmsg = ng_btsocket_sco_node_rcvmsg, .shutdown = ng_btsocket_sco_node_shutdown, .newhook = ng_btsocket_sco_node_newhook, .connect = ng_btsocket_sco_node_connect, .rcvdata = ng_btsocket_sco_node_rcvdata, .disconnect = ng_btsocket_sco_node_disconnect, }; /* Globals */ static u_int32_t ng_btsocket_sco_debug_level; static node_p ng_btsocket_sco_node; static struct ng_bt_itemq ng_btsocket_sco_queue; static struct mtx ng_btsocket_sco_queue_mtx; static struct task ng_btsocket_sco_queue_task; static struct mtx ng_btsocket_sco_sockets_mtx; static LIST_HEAD(, ng_btsocket_sco_pcb) ng_btsocket_sco_sockets; static LIST_HEAD(, ng_btsocket_sco_rtentry) ng_btsocket_sco_rt; static struct mtx ng_btsocket_sco_rt_mtx; static struct task ng_btsocket_sco_rt_task; static struct timeval ng_btsocket_sco_lasttime; static int ng_btsocket_sco_curpps; /* Sysctl tree */ SYSCTL_DECL(_net_bluetooth_sco_sockets); static SYSCTL_NODE(_net_bluetooth_sco_sockets, OID_AUTO, seq, CTLFLAG_RW, 0, "Bluetooth SEQPACKET SCO sockets family"); SYSCTL_UINT(_net_bluetooth_sco_sockets_seq, OID_AUTO, debug_level, CTLFLAG_RW, &ng_btsocket_sco_debug_level, NG_BTSOCKET_WARN_LEVEL, "Bluetooth SEQPACKET SCO sockets debug level"); SYSCTL_UINT(_net_bluetooth_sco_sockets_seq, OID_AUTO, queue_len, CTLFLAG_RD, &ng_btsocket_sco_queue.len, 0, "Bluetooth SEQPACKET SCO sockets input queue length"); SYSCTL_UINT(_net_bluetooth_sco_sockets_seq, OID_AUTO, queue_maxlen, CTLFLAG_RD, &ng_btsocket_sco_queue.maxlen, 0, "Bluetooth SEQPACKET SCO sockets input queue max. length"); SYSCTL_UINT(_net_bluetooth_sco_sockets_seq, OID_AUTO, queue_drops, CTLFLAG_RD, &ng_btsocket_sco_queue.drops, 0, "Bluetooth SEQPACKET SCO sockets input queue drops"); /* Debug */ #define NG_BTSOCKET_SCO_INFO \ if (ng_btsocket_sco_debug_level >= NG_BTSOCKET_INFO_LEVEL && \ ppsratecheck(&ng_btsocket_sco_lasttime, &ng_btsocket_sco_curpps, 1)) \ printf #define NG_BTSOCKET_SCO_WARN \ if (ng_btsocket_sco_debug_level >= NG_BTSOCKET_WARN_LEVEL && \ ppsratecheck(&ng_btsocket_sco_lasttime, &ng_btsocket_sco_curpps, 1)) \ printf #define NG_BTSOCKET_SCO_ERR \ if (ng_btsocket_sco_debug_level >= NG_BTSOCKET_ERR_LEVEL && \ ppsratecheck(&ng_btsocket_sco_lasttime, &ng_btsocket_sco_curpps, 1)) \ printf #define NG_BTSOCKET_SCO_ALERT \ if (ng_btsocket_sco_debug_level >= NG_BTSOCKET_ALERT_LEVEL && \ ppsratecheck(&ng_btsocket_sco_lasttime, &ng_btsocket_sco_curpps, 1)) \ printf /* * Netgraph message processing routines */ static int ng_btsocket_sco_process_lp_con_cfm (struct ng_mesg *, ng_btsocket_sco_rtentry_p); static int ng_btsocket_sco_process_lp_con_ind (struct ng_mesg *, ng_btsocket_sco_rtentry_p); static int ng_btsocket_sco_process_lp_discon_ind (struct ng_mesg *, ng_btsocket_sco_rtentry_p); /* * Send LP messages to the lower layer */ static int ng_btsocket_sco_send_lp_con_req (ng_btsocket_sco_pcb_p); static int ng_btsocket_sco_send_lp_con_rsp (ng_btsocket_sco_rtentry_p, bdaddr_p, int); static int ng_btsocket_sco_send_lp_discon_req (ng_btsocket_sco_pcb_p); static int ng_btsocket_sco_send2 (ng_btsocket_sco_pcb_p); /* * Timeout processing routines */ static void ng_btsocket_sco_timeout (ng_btsocket_sco_pcb_p); static void ng_btsocket_sco_untimeout (ng_btsocket_sco_pcb_p); static void ng_btsocket_sco_process_timeout (void *); /* * Other stuff */ static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_addr(bdaddr_p); static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_handle(bdaddr_p, int); static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_addrs(bdaddr_p, bdaddr_p); #define ng_btsocket_sco_wakeup_input_task() \ taskqueue_enqueue(taskqueue_swi, &ng_btsocket_sco_queue_task) #define ng_btsocket_sco_wakeup_route_task() \ taskqueue_enqueue(taskqueue_swi, &ng_btsocket_sco_rt_task) /***************************************************************************** ***************************************************************************** ** Netgraph node interface ***************************************************************************** *****************************************************************************/ /* * Netgraph node constructor. Do not allow to create node of this type. */ static int ng_btsocket_sco_node_constructor(node_p node) { return (EINVAL); } /* ng_btsocket_sco_node_constructor */ /* * Do local shutdown processing. Let old node go and create new fresh one. */ static int ng_btsocket_sco_node_shutdown(node_p node) { int error = 0; NG_NODE_UNREF(node); /* Create new node */ error = ng_make_node_common(&typestruct, &ng_btsocket_sco_node); if (error != 0) { NG_BTSOCKET_SCO_ALERT( "%s: Could not create Netgraph node, error=%d\n", __func__, error); ng_btsocket_sco_node = NULL; return (error); } error = ng_name_node(ng_btsocket_sco_node, NG_BTSOCKET_SCO_NODE_TYPE); if (error != 0) { NG_BTSOCKET_SCO_ALERT( "%s: Could not name Netgraph node, error=%d\n", __func__, error); NG_NODE_UNREF(ng_btsocket_sco_node); ng_btsocket_sco_node = NULL; return (error); } return (0); } /* ng_btsocket_sco_node_shutdown */ /* * We allow any hook to be connected to the node. */ static int ng_btsocket_sco_node_newhook(node_p node, hook_p hook, char const *name) { return (0); } /* ng_btsocket_sco_node_newhook */ /* * Just say "YEP, that's OK by me!" */ static int ng_btsocket_sco_node_connect(hook_p hook) { NG_HOOK_SET_PRIVATE(hook, NULL); NG_HOOK_REF(hook); /* Keep extra reference to the hook */ #if 0 NG_HOOK_FORCE_QUEUE(NG_HOOK_PEER(hook)); NG_HOOK_FORCE_QUEUE(hook); #endif return (0); } /* ng_btsocket_sco_node_connect */ /* * Hook disconnection. Schedule route cleanup task */ static int ng_btsocket_sco_node_disconnect(hook_p hook) { /* * If hook has private information than we must have this hook in * the routing table and must schedule cleaning for the routing table. * Otherwise hook was connected but we never got "hook_info" message, * so we have never added this hook to the routing table and it save * to just delete it. */ if (NG_HOOK_PRIVATE(hook) != NULL) return (ng_btsocket_sco_wakeup_route_task()); NG_HOOK_UNREF(hook); /* Remove extra reference */ return (0); } /* ng_btsocket_sco_node_disconnect */ /* * Process incoming messages */ static int ng_btsocket_sco_node_rcvmsg(node_p node, item_p item, hook_p hook) { struct ng_mesg *msg = NGI_MSG(item); /* item still has message */ int error = 0; if (msg != NULL && msg->header.typecookie == NGM_HCI_COOKIE) { mtx_lock(&ng_btsocket_sco_queue_mtx); if (NG_BT_ITEMQ_FULL(&ng_btsocket_sco_queue)) { NG_BTSOCKET_SCO_ERR( "%s: Input queue is full (msg)\n", __func__); NG_BT_ITEMQ_DROP(&ng_btsocket_sco_queue); NG_FREE_ITEM(item); error = ENOBUFS; } else { if (hook != NULL) { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); } NG_BT_ITEMQ_ENQUEUE(&ng_btsocket_sco_queue, item); error = ng_btsocket_sco_wakeup_input_task(); } mtx_unlock(&ng_btsocket_sco_queue_mtx); } else { NG_FREE_ITEM(item); error = EINVAL; } return (error); } /* ng_btsocket_sco_node_rcvmsg */ /* * Receive data on a hook */ static int ng_btsocket_sco_node_rcvdata(hook_p hook, item_p item) { int error = 0; mtx_lock(&ng_btsocket_sco_queue_mtx); if (NG_BT_ITEMQ_FULL(&ng_btsocket_sco_queue)) { NG_BTSOCKET_SCO_ERR( "%s: Input queue is full (data)\n", __func__); NG_BT_ITEMQ_DROP(&ng_btsocket_sco_queue); NG_FREE_ITEM(item); error = ENOBUFS; } else { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); NG_BT_ITEMQ_ENQUEUE(&ng_btsocket_sco_queue, item); error = ng_btsocket_sco_wakeup_input_task(); } mtx_unlock(&ng_btsocket_sco_queue_mtx); return (error); } /* ng_btsocket_sco_node_rcvdata */ /* * Process LP_ConnectCfm event from the lower layer protocol */ static int ng_btsocket_sco_process_lp_con_cfm(struct ng_mesg *msg, ng_btsocket_sco_rtentry_p rt) { ng_hci_lp_con_cfm_ep *ep = NULL; ng_btsocket_sco_pcb_t *pcb = NULL; int error = 0; if (msg->header.arglen != sizeof(*ep)) return (EMSGSIZE); ep = (ng_hci_lp_con_cfm_ep *)(msg->data); mtx_lock(&ng_btsocket_sco_sockets_mtx); /* Look for the socket with the token */ pcb = ng_btsocket_sco_pcb_by_addrs(&rt->src, &ep->bdaddr); if (pcb == NULL) { mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (ENOENT); } /* pcb is locked */ NG_BTSOCKET_SCO_INFO( "%s: Got LP_ConnectCfm response, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, status=%d, handle=%d, state=%d\n", __func__, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], ep->status, ep->con_handle, pcb->state); if (pcb->state != NG_BTSOCKET_SCO_CONNECTING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (ENOENT); } ng_btsocket_sco_untimeout(pcb); if (ep->status == 0) { /* * Connection is open. Update connection handle and * socket state */ pcb->con_handle = ep->con_handle; pcb->state = NG_BTSOCKET_SCO_OPEN; soisconnected(pcb->so); } else { /* * We have failed to open connection, so disconnect the socket */ pcb->so->so_error = ECONNREFUSED; /* XXX convert status ??? */ pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb->so); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (error); } /* ng_btsocket_sco_process_lp_con_cfm */ /* * Process LP_ConnectInd indicator. Find socket that listens on address. * Find exact or closest match. */ static int ng_btsocket_sco_process_lp_con_ind(struct ng_mesg *msg, ng_btsocket_sco_rtentry_p rt) { ng_hci_lp_con_ind_ep *ep = NULL; ng_btsocket_sco_pcb_t *pcb = NULL, *pcb1 = NULL; int error = 0; u_int16_t status = 0; if (msg->header.arglen != sizeof(*ep)) return (EMSGSIZE); ep = (ng_hci_lp_con_ind_ep *)(msg->data); NG_BTSOCKET_SCO_INFO( "%s: Got LP_ConnectInd indicator, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], ep->bdaddr.b[5], ep->bdaddr.b[4], ep->bdaddr.b[3], ep->bdaddr.b[2], ep->bdaddr.b[1], ep->bdaddr.b[0]); mtx_lock(&ng_btsocket_sco_sockets_mtx); pcb = ng_btsocket_sco_pcb_by_addr(&rt->src); if (pcb != NULL) { - struct socket *so1 = NULL; + struct socket *so1; /* pcb is locked */ - /* - * First check the pending connections queue and if we have - * space then create new socket and set proper source address. - */ - - if (pcb->so->so_qlen <= pcb->so->so_qlimit) { - CURVNET_SET(pcb->so->so_vnet); - so1 = sonewconn(pcb->so, 0); - CURVNET_RESTORE(); - } + CURVNET_SET(pcb->so->so_vnet); + so1 = sonewconn(pcb->so, 0); + CURVNET_RESTORE(); if (so1 == NULL) { status = 0x0d; /* Rejected due to limited resources */ goto respond; } /* * If we got here than we have created new socket. So complete * connection. If we we listening on specific address then copy * source address from listening socket, otherwise copy source * address from hook's routing information. */ pcb1 = so2sco_pcb(so1); KASSERT((pcb1 != NULL), ("%s: pcb1 == NULL\n", __func__)); mtx_lock(&pcb1->pcb_mtx); if (bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(pcb->src)) != 0) bcopy(&pcb->src, &pcb1->src, sizeof(pcb1->src)); else bcopy(&rt->src, &pcb1->src, sizeof(pcb1->src)); pcb1->flags &= ~NG_BTSOCKET_SCO_CLIENT; bcopy(&ep->bdaddr, &pcb1->dst, sizeof(pcb1->dst)); pcb1->rt = rt; } else /* Nobody listens on requested BDADDR */ status = 0x1f; /* Unspecified Error */ respond: error = ng_btsocket_sco_send_lp_con_rsp(rt, &ep->bdaddr, status); if (pcb1 != NULL) { if (error != 0) { pcb1->so->so_error = error; pcb1->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb1->so); } else { pcb1->state = NG_BTSOCKET_SCO_CONNECTING; soisconnecting(pcb1->so); ng_btsocket_sco_timeout(pcb1); } mtx_unlock(&pcb1->pcb_mtx); } if (pcb != NULL) mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (error); } /* ng_btsocket_sco_process_lp_con_ind */ /* * Process LP_DisconnectInd indicator */ static int ng_btsocket_sco_process_lp_discon_ind(struct ng_mesg *msg, ng_btsocket_sco_rtentry_p rt) { ng_hci_lp_discon_ind_ep *ep = NULL; ng_btsocket_sco_pcb_t *pcb = NULL; /* Check message */ if (msg->header.arglen != sizeof(*ep)) return (EMSGSIZE); ep = (ng_hci_lp_discon_ind_ep *)(msg->data); mtx_lock(&ng_btsocket_sco_sockets_mtx); /* Look for the socket with given channel ID */ pcb = ng_btsocket_sco_pcb_by_handle(&rt->src, ep->con_handle); if (pcb == NULL) { mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (0); } /* * Disconnect the socket. If there was any pending request we can * not do anything here anyway. */ /* pcb is locked */ NG_BTSOCKET_SCO_INFO( "%s: Got LP_DisconnectInd indicator, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, handle=%d, state=%d\n", __func__, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->con_handle, pcb->state); if (pcb->flags & NG_BTSOCKET_SCO_TIMO) ng_btsocket_sco_untimeout(pcb); pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (0); } /* ng_btsocket_sco_process_lp_discon_ind */ /* * Send LP_ConnectReq request */ static int ng_btsocket_sco_send_lp_con_req(ng_btsocket_sco_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_hci_lp_con_req_ep *ep = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_HCI_COOKIE, NGM_HCI_LP_CON_REQ, sizeof(*ep), M_NOWAIT); if (msg == NULL) return (ENOMEM); ep = (ng_hci_lp_con_req_ep *)(msg->data); ep->link_type = NG_HCI_LINK_SCO; bcopy(&pcb->dst, &ep->bdaddr, sizeof(ep->bdaddr)); NG_SEND_MSG_HOOK(error, ng_btsocket_sco_node, msg, pcb->rt->hook, 0); return (error); } /* ng_btsocket_sco_send_lp_con_req */ /* * Send LP_ConnectRsp response */ static int ng_btsocket_sco_send_lp_con_rsp(ng_btsocket_sco_rtentry_p rt, bdaddr_p dst, int status) { struct ng_mesg *msg = NULL; ng_hci_lp_con_rsp_ep *ep = NULL; int error = 0; if (rt == NULL || rt->hook == NULL || NG_HOOK_NOT_VALID(rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_HCI_COOKIE, NGM_HCI_LP_CON_RSP, sizeof(*ep), M_NOWAIT); if (msg == NULL) return (ENOMEM); ep = (ng_hci_lp_con_rsp_ep *)(msg->data); ep->status = status; ep->link_type = NG_HCI_LINK_SCO; bcopy(dst, &ep->bdaddr, sizeof(ep->bdaddr)); NG_SEND_MSG_HOOK(error, ng_btsocket_sco_node, msg, rt->hook, 0); return (error); } /* ng_btsocket_sco_send_lp_con_rsp */ /* * Send LP_DisconReq request */ static int ng_btsocket_sco_send_lp_discon_req(ng_btsocket_sco_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_hci_lp_discon_req_ep *ep = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_HCI_COOKIE, NGM_HCI_LP_DISCON_REQ, sizeof(*ep), M_NOWAIT); if (msg == NULL) return (ENOMEM); ep = (ng_hci_lp_discon_req_ep *)(msg->data); ep->con_handle = pcb->con_handle; ep->reason = 0x13; /* User Ended Connection */ NG_SEND_MSG_HOOK(error, ng_btsocket_sco_node, msg, pcb->rt->hook, 0); return (error); } /* ng_btsocket_sco_send_lp_discon_req */ /***************************************************************************** ***************************************************************************** ** Socket interface ***************************************************************************** *****************************************************************************/ /* * SCO sockets data input routine */ static void ng_btsocket_sco_data_input(struct mbuf *m, hook_p hook) { ng_hci_scodata_pkt_t *hdr = NULL; ng_btsocket_sco_pcb_t *pcb = NULL; ng_btsocket_sco_rtentry_t *rt = NULL; u_int16_t con_handle; if (hook == NULL) { NG_BTSOCKET_SCO_ALERT( "%s: Invalid source hook for SCO data packet\n", __func__); goto drop; } rt = (ng_btsocket_sco_rtentry_t *) NG_HOOK_PRIVATE(hook); if (rt == NULL) { NG_BTSOCKET_SCO_ALERT( "%s: Could not find out source bdaddr for SCO data packet\n", __func__); goto drop; } /* Make sure we can access header */ if (m->m_pkthdr.len < sizeof(*hdr)) { NG_BTSOCKET_SCO_ERR( "%s: SCO data packet too small, len=%d\n", __func__, m->m_pkthdr.len); goto drop; } if (m->m_len < sizeof(*hdr)) { m = m_pullup(m, sizeof(*hdr)); if (m == NULL) goto drop; } /* Strip SCO packet header and verify packet length */ hdr = mtod(m, ng_hci_scodata_pkt_t *); m_adj(m, sizeof(*hdr)); if (hdr->length != m->m_pkthdr.len) { NG_BTSOCKET_SCO_ERR( "%s: Bad SCO data packet length, len=%d, length=%d\n", __func__, m->m_pkthdr.len, hdr->length); goto drop; } /* * Now process packet */ con_handle = NG_HCI_CON_HANDLE(le16toh(hdr->con_handle)); NG_BTSOCKET_SCO_INFO( "%s: Received SCO data packet: src bdaddr=%x:%x:%x:%x:%x:%x, handle=%d, " \ "length=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], con_handle, hdr->length); mtx_lock(&ng_btsocket_sco_sockets_mtx); /* Find socket */ pcb = ng_btsocket_sco_pcb_by_handle(&rt->src, con_handle); if (pcb == NULL) { mtx_unlock(&ng_btsocket_sco_sockets_mtx); goto drop; } /* pcb is locked */ if (pcb->state != NG_BTSOCKET_SCO_OPEN) { NG_BTSOCKET_SCO_ERR( "%s: No connected socket found, src bdaddr=%x:%x:%x:%x:%x:%x, state=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], pcb->state); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); goto drop; } /* Check if we have enough space in socket receive queue */ if (m->m_pkthdr.len > sbspace(&pcb->so->so_rcv)) { NG_BTSOCKET_SCO_ERR( "%s: Not enough space in socket receive queue. Dropping SCO data packet, " \ "src bdaddr=%x:%x:%x:%x:%x:%x, len=%d, space=%ld\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], m->m_pkthdr.len, sbspace(&pcb->so->so_rcv)); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); goto drop; } /* Append packet to the socket receive queue and wakeup */ sbappendrecord(&pcb->so->so_rcv, m); m = NULL; sorwakeup(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); drop: NG_FREE_M(m); /* checks for m != NULL */ } /* ng_btsocket_sco_data_input */ /* * SCO sockets default message input routine */ static void ng_btsocket_sco_default_msg_input(struct ng_mesg *msg, hook_p hook) { ng_btsocket_sco_rtentry_t *rt = NULL; if (hook == NULL || NG_HOOK_NOT_VALID(hook)) return; rt = (ng_btsocket_sco_rtentry_t *) NG_HOOK_PRIVATE(hook); switch (msg->header.cmd) { case NGM_HCI_NODE_UP: { ng_hci_node_up_ep *ep = NULL; if (msg->header.arglen != sizeof(*ep)) break; ep = (ng_hci_node_up_ep *)(msg->data); if (bcmp(&ep->bdaddr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) break; if (rt == NULL) { rt = malloc(sizeof(*rt), M_NETGRAPH_BTSOCKET_SCO, M_NOWAIT|M_ZERO); if (rt == NULL) break; NG_HOOK_SET_PRIVATE(hook, rt); mtx_lock(&ng_btsocket_sco_rt_mtx); LIST_INSERT_HEAD(&ng_btsocket_sco_rt, rt, next); } else mtx_lock(&ng_btsocket_sco_rt_mtx); bcopy(&ep->bdaddr, &rt->src, sizeof(rt->src)); rt->pkt_size = (ep->pkt_size == 0)? 60 : ep->pkt_size; rt->num_pkts = ep->num_pkts; rt->hook = hook; mtx_unlock(&ng_btsocket_sco_rt_mtx); NG_BTSOCKET_SCO_INFO( "%s: Updating hook \"%s\", src bdaddr=%x:%x:%x:%x:%x:%x, pkt_size=%d, " \ "num_pkts=%d\n", __func__, NG_HOOK_NAME(hook), rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], rt->pkt_size, rt->num_pkts); } break; case NGM_HCI_SYNC_CON_QUEUE: { ng_hci_sync_con_queue_ep *ep = NULL; ng_btsocket_sco_pcb_t *pcb = NULL; if (rt == NULL || msg->header.arglen != sizeof(*ep)) break; ep = (ng_hci_sync_con_queue_ep *)(msg->data); rt->pending -= ep->completed; if (rt->pending < 0) { NG_BTSOCKET_SCO_WARN( "%s: Pending packet counter is out of sync! bdaddr=%x:%x:%x:%x:%x:%x, " \ "handle=%d, pending=%d, completed=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], ep->con_handle, rt->pending, ep->completed); rt->pending = 0; } mtx_lock(&ng_btsocket_sco_sockets_mtx); /* Find socket */ pcb = ng_btsocket_sco_pcb_by_handle(&rt->src, ep->con_handle); if (pcb == NULL) { mtx_unlock(&ng_btsocket_sco_sockets_mtx); break; } /* pcb is locked */ /* Check state */ if (pcb->state == NG_BTSOCKET_SCO_OPEN) { /* Remove timeout */ ng_btsocket_sco_untimeout(pcb); /* Drop completed packets from the send queue */ for (; ep->completed > 0; ep->completed --) sbdroprecord(&pcb->so->so_snd); /* Send more if we have any */ if (sbavail(&pcb->so->so_snd) > 0) if (ng_btsocket_sco_send2(pcb) == 0) ng_btsocket_sco_timeout(pcb); /* Wake up writers */ sowwakeup(pcb->so); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); } break; default: NG_BTSOCKET_SCO_WARN( "%s: Unknown message, cmd=%d\n", __func__, msg->header.cmd); break; } NG_FREE_MSG(msg); /* Checks for msg != NULL */ } /* ng_btsocket_sco_default_msg_input */ /* * SCO sockets LP message input routine */ static void ng_btsocket_sco_lp_msg_input(struct ng_mesg *msg, hook_p hook) { ng_btsocket_sco_rtentry_p rt = NULL; if (hook == NULL) { NG_BTSOCKET_SCO_ALERT( "%s: Invalid source hook for LP message\n", __func__); goto drop; } rt = (ng_btsocket_sco_rtentry_p) NG_HOOK_PRIVATE(hook); if (rt == NULL) { NG_BTSOCKET_SCO_ALERT( "%s: Could not find out source bdaddr for LP message\n", __func__); goto drop; } switch (msg->header.cmd) { case NGM_HCI_LP_CON_CFM: /* Connection Confirmation Event */ ng_btsocket_sco_process_lp_con_cfm(msg, rt); break; case NGM_HCI_LP_CON_IND: /* Connection Indication Event */ ng_btsocket_sco_process_lp_con_ind(msg, rt); break; case NGM_HCI_LP_DISCON_IND: /* Disconnection Indication Event */ ng_btsocket_sco_process_lp_discon_ind(msg, rt); break; /* XXX FIXME add other LP messages */ default: NG_BTSOCKET_SCO_WARN( "%s: Unknown LP message, cmd=%d\n", __func__, msg->header.cmd); break; } drop: NG_FREE_MSG(msg); } /* ng_btsocket_sco_lp_msg_input */ /* * SCO sockets input routine */ static void ng_btsocket_sco_input(void *context, int pending) { item_p item = NULL; hook_p hook = NULL; for (;;) { mtx_lock(&ng_btsocket_sco_queue_mtx); NG_BT_ITEMQ_DEQUEUE(&ng_btsocket_sco_queue, item); mtx_unlock(&ng_btsocket_sco_queue_mtx); if (item == NULL) break; NGI_GET_HOOK(item, hook); if (hook != NULL && NG_HOOK_NOT_VALID(hook)) goto drop; switch(item->el_flags & NGQF_TYPE) { case NGQF_DATA: { struct mbuf *m = NULL; NGI_GET_M(item, m); ng_btsocket_sco_data_input(m, hook); } break; case NGQF_MESG: { struct ng_mesg *msg = NULL; NGI_GET_MSG(item, msg); switch (msg->header.cmd) { case NGM_HCI_LP_CON_CFM: case NGM_HCI_LP_CON_IND: case NGM_HCI_LP_DISCON_IND: /* XXX FIXME add other LP messages */ ng_btsocket_sco_lp_msg_input(msg, hook); break; default: ng_btsocket_sco_default_msg_input(msg, hook); break; } } break; default: KASSERT(0, ("%s: invalid item type=%ld\n", __func__, (item->el_flags & NGQF_TYPE))); break; } drop: if (hook != NULL) NG_HOOK_UNREF(hook); NG_FREE_ITEM(item); } } /* ng_btsocket_sco_input */ /* * Route cleanup task. Gets scheduled when hook is disconnected. Here we * will find all sockets that use "invalid" hook and disconnect them. */ static void ng_btsocket_sco_rtclean(void *context, int pending) { ng_btsocket_sco_pcb_p pcb = NULL, pcb_next = NULL; ng_btsocket_sco_rtentry_p rt = NULL; /* * First disconnect all sockets that use "invalid" hook */ mtx_lock(&ng_btsocket_sco_sockets_mtx); for(pcb = LIST_FIRST(&ng_btsocket_sco_sockets); pcb != NULL; ) { mtx_lock(&pcb->pcb_mtx); pcb_next = LIST_NEXT(pcb, next); if (pcb->rt != NULL && pcb->rt->hook != NULL && NG_HOOK_NOT_VALID(pcb->rt->hook)) { if (pcb->flags & NG_BTSOCKET_SCO_TIMO) ng_btsocket_sco_untimeout(pcb); pcb->rt = NULL; pcb->so->so_error = ENETDOWN; pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb->so); } mtx_unlock(&pcb->pcb_mtx); pcb = pcb_next; } mtx_unlock(&ng_btsocket_sco_sockets_mtx); /* * Now cleanup routing table */ mtx_lock(&ng_btsocket_sco_rt_mtx); for (rt = LIST_FIRST(&ng_btsocket_sco_rt); rt != NULL; ) { ng_btsocket_sco_rtentry_p rt_next = LIST_NEXT(rt, next); if (rt->hook != NULL && NG_HOOK_NOT_VALID(rt->hook)) { LIST_REMOVE(rt, next); NG_HOOK_SET_PRIVATE(rt->hook, NULL); NG_HOOK_UNREF(rt->hook); /* Remove extra reference */ bzero(rt, sizeof(*rt)); free(rt, M_NETGRAPH_BTSOCKET_SCO); } rt = rt_next; } mtx_unlock(&ng_btsocket_sco_rt_mtx); } /* ng_btsocket_sco_rtclean */ /* * Initialize everything */ void ng_btsocket_sco_init(void) { int error = 0; /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; ng_btsocket_sco_node = NULL; ng_btsocket_sco_debug_level = NG_BTSOCKET_WARN_LEVEL; /* Register Netgraph node type */ error = ng_newtype(&typestruct); if (error != 0) { NG_BTSOCKET_SCO_ALERT( "%s: Could not register Netgraph node type, error=%d\n", __func__, error); return; } /* Create Netgrapg node */ error = ng_make_node_common(&typestruct, &ng_btsocket_sco_node); if (error != 0) { NG_BTSOCKET_SCO_ALERT( "%s: Could not create Netgraph node, error=%d\n", __func__, error); ng_btsocket_sco_node = NULL; return; } error = ng_name_node(ng_btsocket_sco_node, NG_BTSOCKET_SCO_NODE_TYPE); if (error != 0) { NG_BTSOCKET_SCO_ALERT( "%s: Could not name Netgraph node, error=%d\n", __func__, error); NG_NODE_UNREF(ng_btsocket_sco_node); ng_btsocket_sco_node = NULL; return; } /* Create input queue */ NG_BT_ITEMQ_INIT(&ng_btsocket_sco_queue, 300); mtx_init(&ng_btsocket_sco_queue_mtx, "btsocks_sco_queue_mtx", NULL, MTX_DEF); TASK_INIT(&ng_btsocket_sco_queue_task, 0, ng_btsocket_sco_input, NULL); /* Create list of sockets */ LIST_INIT(&ng_btsocket_sco_sockets); mtx_init(&ng_btsocket_sco_sockets_mtx, "btsocks_sco_sockets_mtx", NULL, MTX_DEF); /* Routing table */ LIST_INIT(&ng_btsocket_sco_rt); mtx_init(&ng_btsocket_sco_rt_mtx, "btsocks_sco_rt_mtx", NULL, MTX_DEF); TASK_INIT(&ng_btsocket_sco_rt_task, 0, ng_btsocket_sco_rtclean, NULL); } /* ng_btsocket_sco_init */ /* * Abort connection on socket */ void ng_btsocket_sco_abort(struct socket *so) { so->so_error = ECONNABORTED; (void) ng_btsocket_sco_disconnect(so); } /* ng_btsocket_sco_abort */ void ng_btsocket_sco_close(struct socket *so) { (void) ng_btsocket_sco_disconnect(so); } /* ng_btsocket_sco_close */ /* * Accept connection on socket. Nothing to do here, socket must be connected * and ready, so just return peer address and be done with it. */ int ng_btsocket_sco_accept(struct socket *so, struct sockaddr **nam) { if (ng_btsocket_sco_node == NULL) return (EINVAL); return (ng_btsocket_sco_peeraddr(so, nam)); } /* ng_btsocket_sco_accept */ /* * Create and attach new socket */ int ng_btsocket_sco_attach(struct socket *so, int proto, struct thread *td) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); int error; /* Check socket and protocol */ if (ng_btsocket_sco_node == NULL) return (EPROTONOSUPPORT); if (so->so_type != SOCK_SEQPACKET) return (ESOCKTNOSUPPORT); #if 0 /* XXX sonewconn() calls "pru_attach" with proto == 0 */ if (proto != 0) if (proto != BLUETOOTH_PROTO_SCO) return (EPROTONOSUPPORT); #endif /* XXX */ if (pcb != NULL) return (EISCONN); /* Reserve send and receive space if it is not reserved yet */ if ((so->so_snd.sb_hiwat == 0) || (so->so_rcv.sb_hiwat == 0)) { error = soreserve(so, NG_BTSOCKET_SCO_SENDSPACE, NG_BTSOCKET_SCO_RECVSPACE); if (error != 0) return (error); } /* Allocate the PCB */ pcb = malloc(sizeof(*pcb), M_NETGRAPH_BTSOCKET_SCO, M_NOWAIT | M_ZERO); if (pcb == NULL) return (ENOMEM); /* Link the PCB and the socket */ so->so_pcb = (caddr_t) pcb; pcb->so = so; pcb->state = NG_BTSOCKET_SCO_CLOSED; callout_init(&pcb->timo, 1); /* * Mark PCB mutex as DUPOK to prevent "duplicated lock of * the same type" message. When accepting new SCO connection * ng_btsocket_sco_process_lp_con_ind() holds both PCB mutexes * for "old" (accepting) PCB and "new" (created) PCB. */ mtx_init(&pcb->pcb_mtx, "btsocks_sco_pcb_mtx", NULL, MTX_DEF|MTX_DUPOK); /* * Add the PCB to the list * * XXX FIXME VERY IMPORTANT! * * This is totally FUBAR. We could get here in two cases: * * 1) When user calls socket() * 2) When we need to accept new incoming connection and call * sonewconn() * * In the first case we must acquire ng_btsocket_sco_sockets_mtx. * In the second case we hold ng_btsocket_sco_sockets_mtx already. * So we now need to distinguish between these cases. From reading * /sys/kern/uipc_socket2.c we can find out that sonewconn() calls * pru_attach with proto == 0 and td == NULL. For now use this fact * to figure out if we were called from socket() or from sonewconn(). */ if (td != NULL) mtx_lock(&ng_btsocket_sco_sockets_mtx); else mtx_assert(&ng_btsocket_sco_sockets_mtx, MA_OWNED); LIST_INSERT_HEAD(&ng_btsocket_sco_sockets, pcb, next); if (td != NULL) mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (0); } /* ng_btsocket_sco_attach */ /* * Bind socket */ int ng_btsocket_sco_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_sco_pcb_t *pcb = NULL; struct sockaddr_sco *sa = (struct sockaddr_sco *) nam; if (ng_btsocket_sco_node == NULL) return (EINVAL); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->sco_family != AF_BLUETOOTH) return (EAFNOSUPPORT); if (sa->sco_len != sizeof(*sa)) return (EINVAL); mtx_lock(&ng_btsocket_sco_sockets_mtx); /* * Check if other socket has this address already (look for exact * match in bdaddr) and assign socket address if it's available. */ if (bcmp(&sa->sco_bdaddr, NG_HCI_BDADDR_ANY, sizeof(sa->sco_bdaddr)) != 0) { LIST_FOREACH(pcb, &ng_btsocket_sco_sockets, next) { mtx_lock(&pcb->pcb_mtx); if (bcmp(&pcb->src, &sa->sco_bdaddr, sizeof(bdaddr_t)) == 0) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (EADDRINUSE); } mtx_unlock(&pcb->pcb_mtx); } } pcb = so2sco_pcb(so); if (pcb == NULL) { mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (EINVAL); } mtx_lock(&pcb->pcb_mtx); bcopy(&sa->sco_bdaddr, &pcb->src, sizeof(pcb->src)); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (0); } /* ng_btsocket_sco_bind */ /* * Connect socket */ int ng_btsocket_sco_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_sco_pcb_t *pcb = so2sco_pcb(so); struct sockaddr_sco *sa = (struct sockaddr_sco *) nam; ng_btsocket_sco_rtentry_t *rt = NULL; int have_src, error = 0; /* Check socket */ if (pcb == NULL) return (EINVAL); if (ng_btsocket_sco_node == NULL) return (EINVAL); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->sco_family != AF_BLUETOOTH) return (EAFNOSUPPORT); if (sa->sco_len != sizeof(*sa)) return (EINVAL); if (bcmp(&sa->sco_bdaddr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) return (EDESTADDRREQ); /* * Routing. Socket should be bound to some source address. The source * address can be ANY. Destination address must be set and it must not * be ANY. If source address is ANY then find first rtentry that has * src != dst. */ mtx_lock(&ng_btsocket_sco_rt_mtx); mtx_lock(&pcb->pcb_mtx); if (pcb->state == NG_BTSOCKET_SCO_CONNECTING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_rt_mtx); return (EINPROGRESS); } if (bcmp(&sa->sco_bdaddr, &pcb->src, sizeof(pcb->src)) == 0) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_rt_mtx); return (EINVAL); } /* Send destination address and PSM */ bcopy(&sa->sco_bdaddr, &pcb->dst, sizeof(pcb->dst)); pcb->rt = NULL; have_src = bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(pcb->src)); LIST_FOREACH(rt, &ng_btsocket_sco_rt, next) { if (rt->hook == NULL || NG_HOOK_NOT_VALID(rt->hook)) continue; /* Match src and dst */ if (have_src) { if (bcmp(&pcb->src, &rt->src, sizeof(rt->src)) == 0) break; } else { if (bcmp(&pcb->dst, &rt->src, sizeof(rt->src)) != 0) break; } } if (rt != NULL) { pcb->rt = rt; if (!have_src) bcopy(&rt->src, &pcb->src, sizeof(pcb->src)); } else error = EHOSTUNREACH; /* * Send LP_Connect request */ if (error == 0) { error = ng_btsocket_sco_send_lp_con_req(pcb); if (error == 0) { pcb->flags |= NG_BTSOCKET_SCO_CLIENT; pcb->state = NG_BTSOCKET_SCO_CONNECTING; soisconnecting(pcb->so); ng_btsocket_sco_timeout(pcb); } } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_rt_mtx); return (error); } /* ng_btsocket_sco_connect */ /* * Process ioctl's calls on socket */ int ng_btsocket_sco_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return (EINVAL); } /* ng_btsocket_sco_control */ /* * Process getsockopt/setsockopt system calls */ int ng_btsocket_sco_ctloutput(struct socket *so, struct sockopt *sopt) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); int error, tmp; if (ng_btsocket_sco_node == NULL) return (EINVAL); if (pcb == NULL) return (EINVAL); if (sopt->sopt_level != SOL_SCO) return (0); mtx_lock(&pcb->pcb_mtx); switch (sopt->sopt_dir) { case SOPT_GET: if (pcb->state != NG_BTSOCKET_SCO_OPEN) { error = ENOTCONN; break; } switch (sopt->sopt_name) { case SO_SCO_MTU: tmp = pcb->rt->pkt_size; error = sooptcopyout(sopt, &tmp, sizeof(tmp)); break; case SO_SCO_CONNINFO: tmp = pcb->con_handle; error = sooptcopyout(sopt, &tmp, sizeof(tmp)); break; default: error = EINVAL; break; } break; case SOPT_SET: error = ENOPROTOOPT; break; default: error = EINVAL; break; } mtx_unlock(&pcb->pcb_mtx); return (error); } /* ng_btsocket_sco_ctloutput */ /* * Detach and destroy socket */ void ng_btsocket_sco_detach(struct socket *so) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); KASSERT(pcb != NULL, ("ng_btsocket_sco_detach: pcb == NULL")); if (ng_btsocket_sco_node == NULL) return; mtx_lock(&ng_btsocket_sco_sockets_mtx); mtx_lock(&pcb->pcb_mtx); if (pcb->flags & NG_BTSOCKET_SCO_TIMO) ng_btsocket_sco_untimeout(pcb); if (pcb->state == NG_BTSOCKET_SCO_OPEN) ng_btsocket_sco_send_lp_discon_req(pcb); pcb->state = NG_BTSOCKET_SCO_CLOSED; LIST_REMOVE(pcb, next); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); mtx_destroy(&pcb->pcb_mtx); bzero(pcb, sizeof(*pcb)); free(pcb, M_NETGRAPH_BTSOCKET_SCO); soisdisconnected(so); so->so_pcb = NULL; } /* ng_btsocket_sco_detach */ /* * Disconnect socket */ int ng_btsocket_sco_disconnect(struct socket *so) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); if (pcb == NULL) return (EINVAL); if (ng_btsocket_sco_node == NULL) return (EINVAL); mtx_lock(&pcb->pcb_mtx); if (pcb->state == NG_BTSOCKET_SCO_DISCONNECTING) { mtx_unlock(&pcb->pcb_mtx); return (EINPROGRESS); } if (pcb->flags & NG_BTSOCKET_SCO_TIMO) ng_btsocket_sco_untimeout(pcb); if (pcb->state == NG_BTSOCKET_SCO_OPEN) { ng_btsocket_sco_send_lp_discon_req(pcb); pcb->state = NG_BTSOCKET_SCO_DISCONNECTING; soisdisconnecting(so); ng_btsocket_sco_timeout(pcb); } else { pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(so); } mtx_unlock(&pcb->pcb_mtx); return (0); } /* ng_btsocket_sco_disconnect */ /* * Listen on socket */ int ng_btsocket_sco_listen(struct socket *so, int backlog, struct thread *td) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); int error; if (pcb == NULL) return (EINVAL); if (ng_btsocket_sco_node == NULL) return (EINVAL); SOCK_LOCK(so); mtx_lock(&pcb->pcb_mtx); error = solisten_proto_check(so); if (error != 0) goto out; #if 0 if (bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) { error = EDESTADDRREQ; goto out; } #endif solisten_proto(so, backlog); out: mtx_unlock(&pcb->pcb_mtx); SOCK_UNLOCK(so); return (error); } /* ng_btsocket_listen */ /* * Get peer address */ int ng_btsocket_sco_peeraddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); struct sockaddr_sco sa; if (pcb == NULL) return (EINVAL); if (ng_btsocket_sco_node == NULL) return (EINVAL); mtx_lock(&pcb->pcb_mtx); bcopy(&pcb->dst, &sa.sco_bdaddr, sizeof(sa.sco_bdaddr)); mtx_unlock(&pcb->pcb_mtx); sa.sco_len = sizeof(sa); sa.sco_family = AF_BLUETOOTH; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } /* ng_btsocket_sco_peeraddr */ /* * Send data to socket */ int ng_btsocket_sco_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { ng_btsocket_sco_pcb_t *pcb = so2sco_pcb(so); int error = 0; if (ng_btsocket_sco_node == NULL) { error = ENETDOWN; goto drop; } /* Check socket and input */ if (pcb == NULL || m == NULL || control != NULL) { error = EINVAL; goto drop; } mtx_lock(&pcb->pcb_mtx); /* Make sure socket is connected */ if (pcb->state != NG_BTSOCKET_SCO_OPEN) { mtx_unlock(&pcb->pcb_mtx); error = ENOTCONN; goto drop; } /* Check route */ if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) { mtx_unlock(&pcb->pcb_mtx); error = ENETDOWN; goto drop; } /* Check packet size */ if (m->m_pkthdr.len > pcb->rt->pkt_size) { NG_BTSOCKET_SCO_ERR( "%s: Packet too big, len=%d, pkt_size=%d\n", __func__, m->m_pkthdr.len, pcb->rt->pkt_size); mtx_unlock(&pcb->pcb_mtx); error = EMSGSIZE; goto drop; } /* * First put packet on socket send queue. Then check if we have * pending timeout. If we do not have timeout then we must send * packet and schedule timeout. Otherwise do nothing and wait for * NGM_HCI_SYNC_CON_QUEUE message. */ sbappendrecord(&pcb->so->so_snd, m); m = NULL; if (!(pcb->flags & NG_BTSOCKET_SCO_TIMO)) { error = ng_btsocket_sco_send2(pcb); if (error == 0) ng_btsocket_sco_timeout(pcb); else sbdroprecord(&pcb->so->so_snd); /* XXX */ } mtx_unlock(&pcb->pcb_mtx); drop: NG_FREE_M(m); /* checks for != NULL */ NG_FREE_M(control); return (error); } /* ng_btsocket_sco_send */ /* * Send first packet in the socket queue to the SCO layer */ static int ng_btsocket_sco_send2(ng_btsocket_sco_pcb_p pcb) { struct mbuf *m = NULL; ng_hci_scodata_pkt_t *hdr = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); while (pcb->rt->pending < pcb->rt->num_pkts && sbavail(&pcb->so->so_snd) > 0) { /* Get a copy of the first packet on send queue */ m = m_dup(pcb->so->so_snd.sb_mb, M_NOWAIT); if (m == NULL) { error = ENOBUFS; break; } /* Create SCO packet header */ M_PREPEND(m, sizeof(*hdr), M_NOWAIT); if (m != NULL) if (m->m_len < sizeof(*hdr)) m = m_pullup(m, sizeof(*hdr)); if (m == NULL) { error = ENOBUFS; break; } /* Fill in the header */ hdr = mtod(m, ng_hci_scodata_pkt_t *); hdr->type = NG_HCI_SCO_DATA_PKT; hdr->con_handle = htole16(NG_HCI_MK_CON_HANDLE(pcb->con_handle, 0, 0)); hdr->length = m->m_pkthdr.len - sizeof(*hdr); /* Send packet */ NG_SEND_DATA_ONLY(error, pcb->rt->hook, m); if (error != 0) break; pcb->rt->pending ++; } return ((pcb->rt->pending > 0)? 0 : error); } /* ng_btsocket_sco_send2 */ /* * Get socket address */ int ng_btsocket_sco_sockaddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); struct sockaddr_sco sa; if (pcb == NULL) return (EINVAL); if (ng_btsocket_sco_node == NULL) return (EINVAL); mtx_lock(&pcb->pcb_mtx); bcopy(&pcb->src, &sa.sco_bdaddr, sizeof(sa.sco_bdaddr)); mtx_unlock(&pcb->pcb_mtx); sa.sco_len = sizeof(sa); sa.sco_family = AF_BLUETOOTH; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } /* ng_btsocket_sco_sockaddr */ /***************************************************************************** ***************************************************************************** ** Misc. functions ***************************************************************************** *****************************************************************************/ /* * Look for the socket that listens on given bdaddr. * Returns exact or close match (if any). * Caller must hold ng_btsocket_sco_sockets_mtx. * Returns with locked pcb. */ static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_addr(bdaddr_p bdaddr) { ng_btsocket_sco_pcb_p p = NULL, p1 = NULL; mtx_assert(&ng_btsocket_sco_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_sco_sockets, next) { mtx_lock(&p->pcb_mtx); if (p->so == NULL || !(p->so->so_options & SO_ACCEPTCONN)) { mtx_unlock(&p->pcb_mtx); continue; } if (bcmp(&p->src, bdaddr, sizeof(p->src)) == 0) return (p); /* return with locked pcb */ if (bcmp(&p->src, NG_HCI_BDADDR_ANY, sizeof(p->src)) == 0) p1 = p; mtx_unlock(&p->pcb_mtx); } if (p1 != NULL) mtx_lock(&p1->pcb_mtx); return (p1); } /* ng_btsocket_sco_pcb_by_addr */ /* * Look for the socket that assigned to given source address and handle. * Caller must hold ng_btsocket_sco_sockets_mtx. * Returns with locked pcb. */ static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_handle(bdaddr_p src, int con_handle) { ng_btsocket_sco_pcb_p p = NULL; mtx_assert(&ng_btsocket_sco_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_sco_sockets, next) { mtx_lock(&p->pcb_mtx); if (p->con_handle == con_handle && bcmp(src, &p->src, sizeof(p->src)) == 0) return (p); /* return with locked pcb */ mtx_unlock(&p->pcb_mtx); } return (NULL); } /* ng_btsocket_sco_pcb_by_handle */ /* * Look for the socket in CONNECTING state with given source and destination * addresses. Caller must hold ng_btsocket_sco_sockets_mtx. * Returns with locked pcb. */ static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_addrs(bdaddr_p src, bdaddr_p dst) { ng_btsocket_sco_pcb_p p = NULL; mtx_assert(&ng_btsocket_sco_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_sco_sockets, next) { mtx_lock(&p->pcb_mtx); if (p->state == NG_BTSOCKET_SCO_CONNECTING && bcmp(src, &p->src, sizeof(p->src)) == 0 && bcmp(dst, &p->dst, sizeof(p->dst)) == 0) return (p); /* return with locked pcb */ mtx_unlock(&p->pcb_mtx); } return (NULL); } /* ng_btsocket_sco_pcb_by_addrs */ /* * Set timeout on socket */ static void ng_btsocket_sco_timeout(ng_btsocket_sco_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (!(pcb->flags & NG_BTSOCKET_SCO_TIMO)) { pcb->flags |= NG_BTSOCKET_SCO_TIMO; callout_reset(&pcb->timo, bluetooth_sco_rtx_timeout(), ng_btsocket_sco_process_timeout, pcb); } else KASSERT(0, ("%s: Duplicated socket timeout?!\n", __func__)); } /* ng_btsocket_sco_timeout */ /* * Unset timeout on socket */ static void ng_btsocket_sco_untimeout(ng_btsocket_sco_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->flags & NG_BTSOCKET_SCO_TIMO) { callout_stop(&pcb->timo); pcb->flags &= ~NG_BTSOCKET_SCO_TIMO; } else KASSERT(0, ("%s: No socket timeout?!\n", __func__)); } /* ng_btsocket_sco_untimeout */ /* * Process timeout on socket */ static void ng_btsocket_sco_process_timeout(void *xpcb) { ng_btsocket_sco_pcb_p pcb = (ng_btsocket_sco_pcb_p) xpcb; mtx_lock(&pcb->pcb_mtx); pcb->flags &= ~NG_BTSOCKET_SCO_TIMO; pcb->so->so_error = ETIMEDOUT; switch (pcb->state) { case NG_BTSOCKET_SCO_CONNECTING: /* Connect timeout - close the socket */ pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb->so); break; case NG_BTSOCKET_SCO_OPEN: /* Send timeout - did not get NGM_HCI_SYNC_CON_QUEUE */ sbdroprecord(&pcb->so->so_snd); sowwakeup(pcb->so); /* XXX FIXME what to do with pcb->rt->pending??? */ break; case NG_BTSOCKET_SCO_DISCONNECTING: /* Disconnect timeout - disconnect the socket anyway */ pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb->so); break; default: NG_BTSOCKET_SCO_ERR( "%s: Invalid socket state=%d\n", __func__, pcb->state); break; } mtx_unlock(&pcb->pcb_mtx); } /* ng_btsocket_sco_process_timeout */ Index: head/sys/netgraph/ng_ksocket.c =================================================================== --- head/sys/netgraph/ng_ksocket.c (revision 319721) +++ head/sys/netgraph/ng_ksocket.c (revision 319722) @@ -1,1316 +1,1273 @@ /* * ng_ksocket.c */ /*- * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Archie Cobbs * * $FreeBSD$ * $Whistle: ng_ksocket.c,v 1.1 1999/11/16 20:04:40 archie Exp $ */ /* * Kernel socket node type. This node type is basically a kernel-mode * version of a socket... kindof like the reverse of the socket node type. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_KSOCKET, "netgraph_ksock", "netgraph ksock node"); #else #define M_NETGRAPH_KSOCKET M_NETGRAPH #endif #define OFFSETOF(s, e) ((char *)&((s *)0)->e - (char *)((s *)0)) #define SADATA_OFFSET (OFFSETOF(struct sockaddr, sa_data)) /* Node private data */ struct ng_ksocket_private { node_p node; hook_p hook; struct socket *so; int fn_sent; /* FN call on incoming event was sent */ LIST_HEAD(, ng_ksocket_private) embryos; LIST_ENTRY(ng_ksocket_private) siblings; u_int32_t flags; u_int32_t response_token; ng_ID_t response_addr; }; typedef struct ng_ksocket_private *priv_p; /* Flags for priv_p */ #define KSF_CONNECTING 0x00000001 /* Waiting for connection complete */ #define KSF_ACCEPTING 0x00000002 /* Waiting for accept complete */ #define KSF_EOFSEEN 0x00000004 /* Have sent 0-length EOF mbuf */ #define KSF_CLONED 0x00000008 /* Cloned from an accepting socket */ #define KSF_EMBRYONIC 0x00000010 /* Cloned node with no hooks yet */ /* Netgraph node methods */ static ng_constructor_t ng_ksocket_constructor; static ng_rcvmsg_t ng_ksocket_rcvmsg; static ng_shutdown_t ng_ksocket_shutdown; static ng_newhook_t ng_ksocket_newhook; static ng_rcvdata_t ng_ksocket_rcvdata; static ng_connect_t ng_ksocket_connect; static ng_disconnect_t ng_ksocket_disconnect; /* Alias structure */ struct ng_ksocket_alias { const char *name; const int value; const int family; }; /* Protocol family aliases */ static const struct ng_ksocket_alias ng_ksocket_families[] = { { "local", PF_LOCAL }, { "inet", PF_INET }, { "inet6", PF_INET6 }, { "atm", PF_ATM }, { NULL, -1 }, }; /* Socket type aliases */ static const struct ng_ksocket_alias ng_ksocket_types[] = { { "stream", SOCK_STREAM }, { "dgram", SOCK_DGRAM }, { "raw", SOCK_RAW }, { "rdm", SOCK_RDM }, { "seqpacket", SOCK_SEQPACKET }, { NULL, -1 }, }; /* Protocol aliases */ static const struct ng_ksocket_alias ng_ksocket_protos[] = { { "ip", IPPROTO_IP, PF_INET }, { "raw", IPPROTO_RAW, PF_INET }, { "icmp", IPPROTO_ICMP, PF_INET }, { "igmp", IPPROTO_IGMP, PF_INET }, { "tcp", IPPROTO_TCP, PF_INET }, { "udp", IPPROTO_UDP, PF_INET }, { "gre", IPPROTO_GRE, PF_INET }, { "esp", IPPROTO_ESP, PF_INET }, { "ah", IPPROTO_AH, PF_INET }, { "swipe", IPPROTO_SWIPE, PF_INET }, { "encap", IPPROTO_ENCAP, PF_INET }, { "divert", IPPROTO_DIVERT, PF_INET }, { "pim", IPPROTO_PIM, PF_INET }, { NULL, -1 }, }; /* Helper functions */ -static int ng_ksocket_check_accept(priv_p); -static void ng_ksocket_finish_accept(priv_p); +static int ng_ksocket_accept(priv_p); static int ng_ksocket_incoming(struct socket *so, void *arg, int waitflag); static int ng_ksocket_parse(const struct ng_ksocket_alias *aliases, const char *s, int family); static void ng_ksocket_incoming2(node_p node, hook_p hook, void *arg1, int arg2); /************************************************************************ STRUCT SOCKADDR PARSE TYPE ************************************************************************/ /* Get the length of the data portion of a generic struct sockaddr */ static int ng_parse_generic_sockdata_getLength(const struct ng_parse_type *type, const u_char *start, const u_char *buf) { const struct sockaddr *sa; sa = (const struct sockaddr *)(buf - SADATA_OFFSET); return (sa->sa_len < SADATA_OFFSET) ? 0 : sa->sa_len - SADATA_OFFSET; } /* Type for the variable length data portion of a generic struct sockaddr */ static const struct ng_parse_type ng_ksocket_generic_sockdata_type = { &ng_parse_bytearray_type, &ng_parse_generic_sockdata_getLength }; /* Type for a generic struct sockaddr */ static const struct ng_parse_struct_field ng_parse_generic_sockaddr_type_fields[] = { { "len", &ng_parse_uint8_type }, { "family", &ng_parse_uint8_type }, { "data", &ng_ksocket_generic_sockdata_type }, { NULL } }; static const struct ng_parse_type ng_ksocket_generic_sockaddr_type = { &ng_parse_struct_type, &ng_parse_generic_sockaddr_type_fields }; /* Convert a struct sockaddr from ASCII to binary. If its a protocol family that we specially handle, do that, otherwise defer to the generic parse type ng_ksocket_generic_sockaddr_type. */ static int ng_ksocket_sockaddr_parse(const struct ng_parse_type *type, const char *s, int *off, const u_char *const start, u_char *const buf, int *buflen) { struct sockaddr *const sa = (struct sockaddr *)buf; enum ng_parse_token tok; char fambuf[32]; int family, len; char *t; /* If next token is a left curly brace, use generic parse type */ if ((tok = ng_parse_get_token(s, off, &len)) == T_LBRACE) { return (*ng_ksocket_generic_sockaddr_type.supertype->parse) (&ng_ksocket_generic_sockaddr_type, s, off, start, buf, buflen); } /* Get socket address family followed by a slash */ while (isspace(s[*off])) (*off)++; if ((t = strchr(s + *off, '/')) == NULL) return (EINVAL); if ((len = t - (s + *off)) > sizeof(fambuf) - 1) return (EINVAL); strncpy(fambuf, s + *off, len); fambuf[len] = '\0'; *off += len + 1; if ((family = ng_ksocket_parse(ng_ksocket_families, fambuf, 0)) == -1) return (EINVAL); /* Set family */ if (*buflen < SADATA_OFFSET) return (ERANGE); sa->sa_family = family; /* Set family-specific data and length */ switch (sa->sa_family) { case PF_LOCAL: /* Get pathname */ { const int pathoff = OFFSETOF(struct sockaddr_un, sun_path); struct sockaddr_un *const sun = (struct sockaddr_un *)sa; int toklen, pathlen; char *path; if ((path = ng_get_string_token(s, off, &toklen, NULL)) == NULL) return (EINVAL); pathlen = strlen(path); if (pathlen > SOCK_MAXADDRLEN) { free(path, M_NETGRAPH_KSOCKET); return (E2BIG); } if (*buflen < pathoff + pathlen) { free(path, M_NETGRAPH_KSOCKET); return (ERANGE); } *off += toklen; bcopy(path, sun->sun_path, pathlen); sun->sun_len = pathoff + pathlen; free(path, M_NETGRAPH_KSOCKET); break; } case PF_INET: /* Get an IP address with optional port */ { struct sockaddr_in *const sin = (struct sockaddr_in *)sa; int i; /* Parse this: [:port] */ for (i = 0; i < 4; i++) { u_long val; char *eptr; val = strtoul(s + *off, &eptr, 10); if (val > 0xff || eptr == s + *off) return (EINVAL); *off += (eptr - (s + *off)); ((u_char *)&sin->sin_addr)[i] = (u_char)val; if (i < 3) { if (s[*off] != '.') return (EINVAL); (*off)++; } else if (s[*off] == ':') { (*off)++; val = strtoul(s + *off, &eptr, 10); if (val > 0xffff || eptr == s + *off) return (EINVAL); *off += (eptr - (s + *off)); sin->sin_port = htons(val); } else sin->sin_port = 0; } bzero(&sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_len = sizeof(*sin); break; } #if 0 case PF_INET6: /* XXX implement this someday */ #endif default: return (EINVAL); } /* Done */ *buflen = sa->sa_len; return (0); } /* Convert a struct sockaddr from binary to ASCII */ static int ng_ksocket_sockaddr_unparse(const struct ng_parse_type *type, const u_char *data, int *off, char *cbuf, int cbuflen) { const struct sockaddr *sa = (const struct sockaddr *)(data + *off); int slen = 0; /* Output socket address, either in special or generic format */ switch (sa->sa_family) { case PF_LOCAL: { const int pathoff = OFFSETOF(struct sockaddr_un, sun_path); const struct sockaddr_un *sun = (const struct sockaddr_un *)sa; const int pathlen = sun->sun_len - pathoff; char pathbuf[SOCK_MAXADDRLEN + 1]; char *pathtoken; bcopy(sun->sun_path, pathbuf, pathlen); if ((pathtoken = ng_encode_string(pathbuf, pathlen)) == NULL) return (ENOMEM); slen += snprintf(cbuf, cbuflen, "local/%s", pathtoken); free(pathtoken, M_NETGRAPH_KSOCKET); if (slen >= cbuflen) return (ERANGE); *off += sun->sun_len; return (0); } case PF_INET: { const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; slen += snprintf(cbuf, cbuflen, "inet/%d.%d.%d.%d", ((const u_char *)&sin->sin_addr)[0], ((const u_char *)&sin->sin_addr)[1], ((const u_char *)&sin->sin_addr)[2], ((const u_char *)&sin->sin_addr)[3]); if (sin->sin_port != 0) { slen += snprintf(cbuf + strlen(cbuf), cbuflen - strlen(cbuf), ":%d", (u_int)ntohs(sin->sin_port)); } if (slen >= cbuflen) return (ERANGE); *off += sizeof(*sin); return(0); } #if 0 case PF_INET6: /* XXX implement this someday */ #endif default: return (*ng_ksocket_generic_sockaddr_type.supertype->unparse) (&ng_ksocket_generic_sockaddr_type, data, off, cbuf, cbuflen); } } /* Parse type for struct sockaddr */ static const struct ng_parse_type ng_ksocket_sockaddr_type = { NULL, NULL, NULL, &ng_ksocket_sockaddr_parse, &ng_ksocket_sockaddr_unparse, NULL /* no such thing as a default struct sockaddr */ }; /************************************************************************ STRUCT NG_KSOCKET_SOCKOPT PARSE TYPE ************************************************************************/ /* Get length of the struct ng_ksocket_sockopt value field, which is the just the excess of the message argument portion over the length of the struct ng_ksocket_sockopt. */ static int ng_parse_sockoptval_getLength(const struct ng_parse_type *type, const u_char *start, const u_char *buf) { static const int offset = OFFSETOF(struct ng_ksocket_sockopt, value); const struct ng_ksocket_sockopt *sopt; const struct ng_mesg *msg; sopt = (const struct ng_ksocket_sockopt *)(buf - offset); msg = (const struct ng_mesg *)((const u_char *)sopt - sizeof(*msg)); return msg->header.arglen - sizeof(*sopt); } /* Parse type for the option value part of a struct ng_ksocket_sockopt XXX Eventually, we should handle the different socket options specially. XXX This would avoid byte order problems, eg an integer value of 1 is XXX going to be "[1]" for little endian or "[3=1]" for big endian. */ static const struct ng_parse_type ng_ksocket_sockoptval_type = { &ng_parse_bytearray_type, &ng_parse_sockoptval_getLength }; /* Parse type for struct ng_ksocket_sockopt */ static const struct ng_parse_struct_field ng_ksocket_sockopt_type_fields[] = NG_KSOCKET_SOCKOPT_INFO(&ng_ksocket_sockoptval_type); static const struct ng_parse_type ng_ksocket_sockopt_type = { &ng_parse_struct_type, &ng_ksocket_sockopt_type_fields }; /* Parse type for struct ng_ksocket_accept */ static const struct ng_parse_struct_field ng_ksocket_accept_type_fields[] = NGM_KSOCKET_ACCEPT_INFO; static const struct ng_parse_type ng_ksocket_accept_type = { &ng_parse_struct_type, &ng_ksocket_accept_type_fields }; /* List of commands and how to convert arguments to/from ASCII */ static const struct ng_cmdlist ng_ksocket_cmds[] = { { NGM_KSOCKET_COOKIE, NGM_KSOCKET_BIND, "bind", &ng_ksocket_sockaddr_type, NULL }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_LISTEN, "listen", &ng_parse_int32_type, NULL }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_ACCEPT, "accept", NULL, &ng_ksocket_accept_type }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_CONNECT, "connect", &ng_ksocket_sockaddr_type, &ng_parse_int32_type }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_GETNAME, "getname", NULL, &ng_ksocket_sockaddr_type }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_GETPEERNAME, "getpeername", NULL, &ng_ksocket_sockaddr_type }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_SETOPT, "setopt", &ng_ksocket_sockopt_type, NULL }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_GETOPT, "getopt", &ng_ksocket_sockopt_type, &ng_ksocket_sockopt_type }, { 0 } }; /* Node type descriptor */ static struct ng_type ng_ksocket_typestruct = { .version = NG_ABI_VERSION, .name = NG_KSOCKET_NODE_TYPE, .constructor = ng_ksocket_constructor, .rcvmsg = ng_ksocket_rcvmsg, .shutdown = ng_ksocket_shutdown, .newhook = ng_ksocket_newhook, .connect = ng_ksocket_connect, .rcvdata = ng_ksocket_rcvdata, .disconnect = ng_ksocket_disconnect, .cmdlist = ng_ksocket_cmds, }; NETGRAPH_INIT(ksocket, &ng_ksocket_typestruct); #define ERROUT(x) do { error = (x); goto done; } while (0) /************************************************************************ NETGRAPH NODE STUFF ************************************************************************/ /* * Node type constructor * The NODE part is assumed to be all set up. * There is already a reference to the node for us. */ static int ng_ksocket_constructor(node_p node) { priv_p priv; /* Allocate private structure */ priv = malloc(sizeof(*priv), M_NETGRAPH_KSOCKET, M_NOWAIT | M_ZERO); if (priv == NULL) return (ENOMEM); LIST_INIT(&priv->embryos); /* cross link them */ priv->node = node; NG_NODE_SET_PRIVATE(node, priv); /* Done */ return (0); } /* * Give our OK for a hook to be added. The hook name is of the * form "//" where the three components may * be decimal numbers or else aliases from the above lists. * * Connecting a hook amounts to opening the socket. Disconnecting * the hook closes the socket and destroys the node as well. */ static int ng_ksocket_newhook(node_p node, hook_p hook, const char *name0) { struct thread *td = curthread; /* XXX broken */ const priv_p priv = NG_NODE_PRIVATE(node); char *s1, *s2, name[NG_HOOKSIZ]; int family, type, protocol, error; /* Check if we're already connected */ if (priv->hook != NULL) return (EISCONN); if (priv->flags & KSF_CLONED) { if (priv->flags & KSF_EMBRYONIC) { /* Remove ourselves from our parent's embryo list */ LIST_REMOVE(priv, siblings); priv->flags &= ~KSF_EMBRYONIC; } } else { /* Extract family, type, and protocol from hook name */ snprintf(name, sizeof(name), "%s", name0); s1 = name; if ((s2 = strchr(s1, '/')) == NULL) return (EINVAL); *s2++ = '\0'; family = ng_ksocket_parse(ng_ksocket_families, s1, 0); if (family == -1) return (EINVAL); s1 = s2; if ((s2 = strchr(s1, '/')) == NULL) return (EINVAL); *s2++ = '\0'; type = ng_ksocket_parse(ng_ksocket_types, s1, 0); if (type == -1) return (EINVAL); s1 = s2; protocol = ng_ksocket_parse(ng_ksocket_protos, s1, family); if (protocol == -1) return (EINVAL); /* Create the socket */ error = socreate(family, &priv->so, type, protocol, td->td_ucred, td); if (error != 0) return (error); /* XXX call soreserve() ? */ } /* OK */ priv->hook = hook; /* * In case of misconfigured routing a packet may reenter * ksocket node recursively. Decouple stack to avoid possible * panics about sleeping with locks held. */ NG_HOOK_FORCE_QUEUE(hook); return(0); } static int ng_ksocket_connect(hook_p hook) { node_p node = NG_HOOK_NODE(hook); const priv_p priv = NG_NODE_PRIVATE(node); struct socket *const so = priv->so; /* Add our hook for incoming data and other events */ SOCKBUF_LOCK(&priv->so->so_rcv); soupcall_set(priv->so, SO_RCV, ng_ksocket_incoming, node); SOCKBUF_UNLOCK(&priv->so->so_rcv); SOCKBUF_LOCK(&priv->so->so_snd); soupcall_set(priv->so, SO_SND, ng_ksocket_incoming, node); SOCKBUF_UNLOCK(&priv->so->so_snd); SOCK_LOCK(priv->so); priv->so->so_state |= SS_NBIO; SOCK_UNLOCK(priv->so); /* * --Original comment-- * On a cloned socket we may have already received one or more * upcalls which we couldn't handle without a hook. Handle * those now. * We cannot call the upcall function directly * from here, because until this function has returned our * hook isn't connected. * * ---meta comment for -current --- * XXX This is dubius. * Upcalls between the time that the hook was * first created and now (on another processesor) will * be earlier on the queue than the request to finalise the hook. * By the time the hook is finalised, * The queued upcalls will have happened and the code * will have discarded them because of a lack of a hook. * (socket not open). * * This is a bad byproduct of the complicated way in which hooks * are now created (3 daisy chained async events). * * Since we are a netgraph operation * We know that we hold a lock on this node. This forces the * request we make below to be queued rather than implemented * immediately which will cause the upcall function to be called a bit * later. * However, as we will run any waiting queued operations immediately * after doing this one, if we have not finalised the other end * of the hook, those queued operations will fail. */ if (priv->flags & KSF_CLONED) { ng_send_fn(node, NULL, &ng_ksocket_incoming2, so, M_NOWAIT); } return (0); } /* * Receive a control message */ static int ng_ksocket_rcvmsg(node_p node, item_p item, hook_p lasthook) { struct thread *td = curthread; /* XXX broken */ const priv_p priv = NG_NODE_PRIVATE(node); struct socket *const so = priv->so; struct ng_mesg *resp = NULL; int error = 0; struct ng_mesg *msg; ng_ID_t raddr; NGI_GET_MSG(item, msg); switch (msg->header.typecookie) { case NGM_KSOCKET_COOKIE: switch (msg->header.cmd) { case NGM_KSOCKET_BIND: { struct sockaddr *const sa = (struct sockaddr *)msg->data; /* Sanity check */ if (msg->header.arglen < SADATA_OFFSET || msg->header.arglen < sa->sa_len) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Bind */ error = sobind(so, sa, td); break; } case NGM_KSOCKET_LISTEN: { /* Sanity check */ if (msg->header.arglen != sizeof(int32_t)) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Listen */ + so->so_state |= SS_NBIO; error = solisten(so, *((int32_t *)msg->data), td); break; } case NGM_KSOCKET_ACCEPT: { /* Sanity check */ if (msg->header.arglen != 0) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Make sure the socket is capable of accepting */ if (!(so->so_options & SO_ACCEPTCONN)) ERROUT(EINVAL); if (priv->flags & KSF_ACCEPTING) ERROUT(EALREADY); - error = ng_ksocket_check_accept(priv); - if (error != 0 && error != EWOULDBLOCK) - ERROUT(error); - /* * If a connection is already complete, take it. * Otherwise let the upcall function deal with * the connection when it comes in. */ + error = ng_ksocket_accept(priv); + if (error != 0 && error != EWOULDBLOCK) + ERROUT(error); priv->response_token = msg->header.token; raddr = priv->response_addr = NGI_RETADDR(item); - if (error == 0) { - ng_ksocket_finish_accept(priv); - } else - priv->flags |= KSF_ACCEPTING; break; } case NGM_KSOCKET_CONNECT: { struct sockaddr *const sa = (struct sockaddr *)msg->data; /* Sanity check */ if (msg->header.arglen < SADATA_OFFSET || msg->header.arglen < sa->sa_len) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Do connect */ if ((so->so_state & SS_ISCONNECTING) != 0) ERROUT(EALREADY); if ((error = soconnect(so, sa, td)) != 0) { so->so_state &= ~SS_ISCONNECTING; ERROUT(error); } if ((so->so_state & SS_ISCONNECTING) != 0) { /* We will notify the sender when we connect */ priv->response_token = msg->header.token; raddr = priv->response_addr = NGI_RETADDR(item); priv->flags |= KSF_CONNECTING; ERROUT(EINPROGRESS); } break; } case NGM_KSOCKET_GETNAME: case NGM_KSOCKET_GETPEERNAME: { int (*func)(struct socket *so, struct sockaddr **nam); struct sockaddr *sa = NULL; int len; /* Sanity check */ if (msg->header.arglen != 0) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Get function */ if (msg->header.cmd == NGM_KSOCKET_GETPEERNAME) { if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) ERROUT(ENOTCONN); func = so->so_proto->pr_usrreqs->pru_peeraddr; } else func = so->so_proto->pr_usrreqs->pru_sockaddr; /* Get local or peer address */ if ((error = (*func)(so, &sa)) != 0) goto bail; len = (sa == NULL) ? 0 : sa->sa_len; /* Send it back in a response */ NG_MKRESPONSE(resp, msg, len, M_NOWAIT); if (resp == NULL) { error = ENOMEM; goto bail; } bcopy(sa, resp->data, len); bail: /* Cleanup */ if (sa != NULL) free(sa, M_SONAME); break; } case NGM_KSOCKET_GETOPT: { struct ng_ksocket_sockopt *ksopt = (struct ng_ksocket_sockopt *)msg->data; struct sockopt sopt; /* Sanity check */ if (msg->header.arglen != sizeof(*ksopt)) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Get response with room for option value */ NG_MKRESPONSE(resp, msg, sizeof(*ksopt) + NG_KSOCKET_MAX_OPTLEN, M_NOWAIT); if (resp == NULL) ERROUT(ENOMEM); /* Get socket option, and put value in the response */ sopt.sopt_dir = SOPT_GET; sopt.sopt_level = ksopt->level; sopt.sopt_name = ksopt->name; sopt.sopt_td = NULL; sopt.sopt_valsize = NG_KSOCKET_MAX_OPTLEN; ksopt = (struct ng_ksocket_sockopt *)resp->data; sopt.sopt_val = ksopt->value; if ((error = sogetopt(so, &sopt)) != 0) { NG_FREE_MSG(resp); break; } /* Set actual value length */ resp->header.arglen = sizeof(*ksopt) + sopt.sopt_valsize; break; } case NGM_KSOCKET_SETOPT: { struct ng_ksocket_sockopt *const ksopt = (struct ng_ksocket_sockopt *)msg->data; const int valsize = msg->header.arglen - sizeof(*ksopt); struct sockopt sopt; /* Sanity check */ if (valsize < 0) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Set socket option */ sopt.sopt_dir = SOPT_SET; sopt.sopt_level = ksopt->level; sopt.sopt_name = ksopt->name; sopt.sopt_val = ksopt->value; sopt.sopt_valsize = valsize; sopt.sopt_td = NULL; error = sosetopt(so, &sopt); break; } default: error = EINVAL; break; } break; default: error = EINVAL; break; } done: NG_RESPOND_MSG(error, node, item, resp); NG_FREE_MSG(msg); return (error); } /* * Receive incoming data on our hook. Send it out the socket. */ static int ng_ksocket_rcvdata(hook_p hook, item_p item) { struct thread *td = curthread; /* XXX broken */ const node_p node = NG_HOOK_NODE(hook); const priv_p priv = NG_NODE_PRIVATE(node); struct socket *const so = priv->so; struct sockaddr *sa = NULL; int error; struct mbuf *m; #ifdef ALIGNED_POINTER struct mbuf *n; #endif /* ALIGNED_POINTER */ struct sa_tag *stag; /* Extract data */ NGI_GET_M(item, m); NG_FREE_ITEM(item); #ifdef ALIGNED_POINTER if (!ALIGNED_POINTER(mtod(m, caddr_t), uint32_t)) { n = m_defrag(m, M_NOWAIT); if (n == NULL) { m_freem(m); return (ENOBUFS); } m = n; } #endif /* ALIGNED_POINTER */ /* * Look if socket address is stored in packet tags. * If sockaddr is ours, or provided by a third party (zero id), * then we accept it. */ if (((stag = (struct sa_tag *)m_tag_locate(m, NGM_KSOCKET_COOKIE, NG_KSOCKET_TAG_SOCKADDR, NULL)) != NULL) && (stag->id == NG_NODE_ID(node) || stag->id == 0)) sa = &stag->sa; /* Reset specific mbuf flags to prevent addressing problems. */ m->m_flags &= ~(M_BCAST|M_MCAST); /* Send packet */ error = sosend(so, sa, 0, m, 0, 0, td); return (error); } /* * Destroy node */ static int ng_ksocket_shutdown(node_p node) { const priv_p priv = NG_NODE_PRIVATE(node); priv_p embryo; /* Close our socket (if any) */ if (priv->so != NULL) { SOCKBUF_LOCK(&priv->so->so_rcv); soupcall_clear(priv->so, SO_RCV); SOCKBUF_UNLOCK(&priv->so->so_rcv); SOCKBUF_LOCK(&priv->so->so_snd); soupcall_clear(priv->so, SO_SND); SOCKBUF_UNLOCK(&priv->so->so_snd); soclose(priv->so); priv->so = NULL; } /* If we are an embryo, take ourselves out of the parent's list */ if (priv->flags & KSF_EMBRYONIC) { LIST_REMOVE(priv, siblings); priv->flags &= ~KSF_EMBRYONIC; } /* Remove any embryonic children we have */ while (!LIST_EMPTY(&priv->embryos)) { embryo = LIST_FIRST(&priv->embryos); ng_rmnode_self(embryo->node); } /* Take down netgraph node */ bzero(priv, sizeof(*priv)); free(priv, M_NETGRAPH_KSOCKET); NG_NODE_SET_PRIVATE(node, NULL); NG_NODE_UNREF(node); /* let the node escape */ return (0); } /* * Hook disconnection */ static int ng_ksocket_disconnect(hook_p hook) { KASSERT(NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0, ("%s: numhooks=%d?", __func__, NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)))); if (NG_NODE_IS_VALID(NG_HOOK_NODE(hook))) ng_rmnode_self(NG_HOOK_NODE(hook)); return (0); } /************************************************************************ HELPER STUFF ************************************************************************/ /* * You should not "just call" a netgraph node function from an external * asynchronous event. This is because in doing so you are ignoring the * locking on the netgraph nodes. Instead call your function via ng_send_fn(). * This will call the function you chose, but will first do all the * locking rigmarole. Your function MAY only be called at some distant future * time (several millisecs away) so don't give it any arguments * that may be revoked soon (e.g. on your stack). * * To decouple stack, we use queue version of ng_send_fn(). */ static int ng_ksocket_incoming(struct socket *so, void *arg, int waitflag) { const node_p node = arg; const priv_p priv = NG_NODE_PRIVATE(node); int wait = ((waitflag & M_WAITOK) ? NG_WAITOK : 0) | NG_QUEUE; /* * Even if node is not locked, as soon as we are called, we assume * it exist and it's private area is valid. With some care we can * access it. Mark node that incoming event for it was sent to * avoid unneded queue trashing. */ if (atomic_cmpset_int(&priv->fn_sent, 0, 1) && ng_send_fn1(node, NULL, &ng_ksocket_incoming2, so, 0, wait)) { atomic_store_rel_int(&priv->fn_sent, 0); } return (SU_OK); } /* * When incoming data is appended to the socket, we get notified here. * This is also called whenever a significant event occurs for the socket. * Our original caller may have queued this even some time ago and * we cannot trust that he even still exists. The node however is being * held with a reference by the queueing code and guarantied to be valid. */ static void ng_ksocket_incoming2(node_p node, hook_p hook, void *arg1, int arg2) { struct socket *so = arg1; const priv_p priv = NG_NODE_PRIVATE(node); struct ng_mesg *response; int error; KASSERT(so == priv->so, ("%s: wrong socket", __func__)); /* Allow next incoming event to be queued. */ atomic_store_rel_int(&priv->fn_sent, 0); /* Check whether a pending connect operation has completed */ if (priv->flags & KSF_CONNECTING) { if ((error = so->so_error) != 0) { so->so_error = 0; so->so_state &= ~SS_ISCONNECTING; } if (!(so->so_state & SS_ISCONNECTING)) { NG_MKMESSAGE(response, NGM_KSOCKET_COOKIE, NGM_KSOCKET_CONNECT, sizeof(int32_t), M_NOWAIT); if (response != NULL) { response->header.flags |= NGF_RESP; response->header.token = priv->response_token; *(int32_t *)response->data = error; /* * send an async "response" message * to the node that set us up * (if it still exists) */ NG_SEND_MSG_ID(error, node, response, priv->response_addr, 0); } priv->flags &= ~KSF_CONNECTING; } } /* Check whether a pending accept operation has completed */ - if (priv->flags & KSF_ACCEPTING) { - error = ng_ksocket_check_accept(priv); - if (error != EWOULDBLOCK) - priv->flags &= ~KSF_ACCEPTING; - if (error == 0) - ng_ksocket_finish_accept(priv); - } + if (priv->flags & KSF_ACCEPTING) + (void )ng_ksocket_accept(priv); /* * If we don't have a hook, we must handle data events later. When * the hook gets created and is connected, this upcall function * will be called again. */ if (priv->hook == NULL) return; /* Read and forward available mbufs. */ while (1) { struct uio uio; struct sockaddr *sa; struct mbuf *m; int flags; /* Try to get next packet from socket. */ uio.uio_td = NULL; uio.uio_resid = IP_MAXPACKET; flags = MSG_DONTWAIT; sa = NULL; if ((error = soreceive(so, (so->so_state & SS_ISCONNECTED) ? NULL : &sa, &uio, &m, NULL, &flags)) != 0) break; /* See if we got anything. */ if (flags & MSG_TRUNC) { m_freem(m); m = NULL; } if (m == NULL) { if (sa != NULL) free(sa, M_SONAME); break; } KASSERT(m->m_nextpkt == NULL, ("%s: nextpkt", __func__)); /* * Stream sockets do not have packet boundaries, so * we have to allocate a header mbuf and attach the * stream of data to it. */ if (so->so_type == SOCK_STREAM) { struct mbuf *mh; mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { m_freem(m); if (sa != NULL) free(sa, M_SONAME); break; } mh->m_next = m; for (; m; m = m->m_next) mh->m_pkthdr.len += m->m_len; m = mh; } /* Put peer's socket address (if any) into a tag */ if (sa != NULL) { struct sa_tag *stag; stag = (struct sa_tag *)m_tag_alloc(NGM_KSOCKET_COOKIE, NG_KSOCKET_TAG_SOCKADDR, sizeof(ng_ID_t) + sa->sa_len, M_NOWAIT); if (stag == NULL) { free(sa, M_SONAME); goto sendit; } bcopy(sa, &stag->sa, sa->sa_len); free(sa, M_SONAME); stag->id = NG_NODE_ID(node); m_tag_prepend(m, &stag->tag); } sendit: /* Forward data with optional peer sockaddr as packet tag */ NG_SEND_DATA_ONLY(error, priv->hook, m); } /* * If the peer has closed the connection, forward a 0-length mbuf * to indicate end-of-file. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE && !(priv->flags & KSF_EOFSEEN)) { struct mbuf *m; m = m_gethdr(M_NOWAIT, MT_DATA); if (m != NULL) NG_SEND_DATA_ONLY(error, priv->hook, m); priv->flags |= KSF_EOFSEEN; } } -/* - * Check for a completed incoming connection and return 0 if one is found. - * Otherwise return the appropriate error code. - */ static int -ng_ksocket_check_accept(priv_p priv) +ng_ksocket_accept(priv_p priv) { struct socket *const head = priv->so; - int error; - - if ((error = head->so_error) != 0) { - head->so_error = 0; - return error; - } - /* Unlocked read. */ - if (TAILQ_EMPTY(&head->so_comp)) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) - return ECONNABORTED; - return EWOULDBLOCK; - } - return 0; -} - -/* - * Handle the first completed incoming connection, assumed to be already - * on the socket's so_comp queue. - */ -static void -ng_ksocket_finish_accept(priv_p priv) -{ - struct socket *const head = priv->so; struct socket *so; struct sockaddr *sa = NULL; struct ng_mesg *resp; struct ng_ksocket_accept *resp_data; node_p node; priv_p priv2; int len; int error; - ACCEPT_LOCK(); - so = TAILQ_FIRST(&head->so_comp); - if (so == NULL) { /* Should never happen */ - ACCEPT_UNLOCK(); - return; + SOLISTEN_LOCK(head); + error = solisten_dequeue(head, &so, SOCK_NONBLOCK); + if (error == EWOULDBLOCK) { + priv->flags |= KSF_ACCEPTING; + return (error); } - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - SOCK_LOCK(so); - soref(so); - so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); + priv->flags &= ~KSF_ACCEPTING; + if (error) + return (error); - /* XXX KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); */ - soaccept(so, &sa); len = OFFSETOF(struct ng_ksocket_accept, addr); if (sa != NULL) len += sa->sa_len; NG_MKMESSAGE(resp, NGM_KSOCKET_COOKIE, NGM_KSOCKET_ACCEPT, len, M_NOWAIT); if (resp == NULL) { soclose(so); goto out; } resp->header.flags |= NGF_RESP; resp->header.token = priv->response_token; /* Clone a ksocket node to wrap the new socket */ error = ng_make_node_common(&ng_ksocket_typestruct, &node); if (error) { free(resp, M_NETGRAPH); soclose(so); goto out; } if (ng_ksocket_constructor(node) != 0) { NG_NODE_UNREF(node); free(resp, M_NETGRAPH); soclose(so); goto out; } priv2 = NG_NODE_PRIVATE(node); priv2->so = so; priv2->flags |= KSF_CLONED | KSF_EMBRYONIC; /* * Insert the cloned node into a list of embryonic children * on the parent node. When a hook is created on the cloned * node it will be removed from this list. When the parent * is destroyed it will destroy any embryonic children it has. */ LIST_INSERT_HEAD(&priv->embryos, priv2, siblings); SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, ng_ksocket_incoming, node); SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); soupcall_set(so, SO_SND, ng_ksocket_incoming, node); SOCKBUF_UNLOCK(&so->so_snd); /* Fill in the response data and send it or return it to the caller */ resp_data = (struct ng_ksocket_accept *)resp->data; resp_data->nodeid = NG_NODE_ID(node); if (sa != NULL) bcopy(sa, &resp_data->addr, sa->sa_len); NG_SEND_MSG_ID(error, node, resp, priv->response_addr, 0); out: if (sa != NULL) free(sa, M_SONAME); + + return (0); } /* * Parse out either an integer value or an alias. */ static int ng_ksocket_parse(const struct ng_ksocket_alias *aliases, const char *s, int family) { int k, val; char *eptr; /* Try aliases */ for (k = 0; aliases[k].name != NULL; k++) { if (strcmp(s, aliases[k].name) == 0 && aliases[k].family == family) return aliases[k].value; } /* Try parsing as a number */ val = (int)strtoul(s, &eptr, 10); if (val < 0 || *eptr != '\0') return (-1); return (val); } Index: head/sys/netinet/sctp_input.c =================================================================== --- head/sys/netinet/sctp_input.c (revision 319721) +++ head/sys/netinet/sctp_input.c (revision 319722) @@ -1,6208 +1,6218 @@ /*- * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #endif #include static void sctp_stop_all_cookie_timers(struct sctp_tcb *stcb) { struct sctp_nets *net; /* * This now not only stops all cookie timers it also stops any INIT * timers as well. This will make sure that the timers are stopped * in all collision cases. */ SCTP_TCB_LOCK_ASSERT(stcb); TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->rxt_timer.type == SCTP_TIMER_TYPE_COOKIE) { sctp_timer_stop(SCTP_TIMER_TYPE_COOKIE, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_1); } else if (net->rxt_timer.type == SCTP_TIMER_TYPE_INIT) { sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_2); } } } /* INIT handler */ static void sctp_handle_init(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_chunk *cp, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_init *init; struct mbuf *op_err; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init: handling INIT tcb:%p\n", (void *)stcb); if (stcb == NULL) { SCTP_INP_RLOCK(inp); } /* validate length */ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_chunk)) { op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } /* validate parameters */ init = &cp->init; if (init->initiate_tag == 0) { /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (ntohl(init->a_rwnd) < SCTP_MIN_RWND) { /* invalid parameter... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (init->num_inbound_streams == 0) { /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (init->num_outbound_streams == 0) { /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (sctp_validate_init_auth_params(m, offset + sizeof(*cp), offset + ntohs(cp->ch.chunk_length))) { /* auth parameter(s) error... send abort */ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Problem with AUTH parameters"); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } /* We are only accepting if we have a listening socket. */ if ((stcb == NULL) && ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (!SCTP_IS_LISTENING(inp)))) { /* * FIX ME ?? What about TCP model and we have a * match/restart case? Actually no fix is needed. the lookup * will always find the existing assoc so stcb would not be * NULL. It may be questionable to do this since we COULD * just send back the INIT-ACK and hope that the app did * accept()'s by the time the COOKIE was sent. But there is * a price to pay for COOKIE generation and I don't want to * pay it on the chance that the app will actually do some * accepts(). The App just looses and should NOT be in this * state :-) */ if (SCTP_BASE_SYSCTL(sctp_blackhole) == 0) { op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "No listener"); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); } goto outnow; } if ((stcb != NULL) && (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT)) { SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending SHUTDOWN-ACK\n"); sctp_send_shutdown_ack(stcb, NULL); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); } else { SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending INIT-ACK\n"); sctp_send_initiate_ack(inp, stcb, net, m, iphlen, offset, src, dst, sh, cp, mflowtype, mflowid, vrf_id, port, ((stcb == NULL) ? SCTP_HOLDS_LOCK : SCTP_NOT_LOCKED)); } outnow: if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); } } /* * process peer "INIT/INIT-ACK" chunk returns value < 0 on error */ int sctp_is_there_unsent_data(struct sctp_tcb *stcb, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { int unsent_data; unsigned int i; struct sctp_stream_queue_pending *sp; struct sctp_association *asoc; /* * This function returns if any stream has true unsent data on it. * Note that as it looks through it will clean up any places that * have old data that has been sent but left at top of stream queue. */ asoc = &stcb->asoc; unsent_data = 0; SCTP_TCB_SEND_LOCK(stcb); if (!stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { /* Check to see if some data queued */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { /* sa_ignore FREED_MEMORY */ sp = TAILQ_FIRST(&stcb->asoc.strmout[i].outqueue); if (sp == NULL) { continue; } if ((sp->msg_is_complete) && (sp->length == 0) && (sp->sender_all_done)) { /* * We are doing differed cleanup. Last time * through when we took all the data the * sender_all_done was not set. */ if (sp->put_last_out == 0) { SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n"); SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d\n", sp->sender_all_done, sp->length, sp->msg_is_complete, sp->put_last_out); } atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1); TAILQ_REMOVE(&stcb->asoc.strmout[i].outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, &asoc->strmout[i], sp, 1); if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; } sctp_free_a_strmoq(stcb, sp, so_locked); if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { unsent_data++; } } else { unsent_data++; } if (unsent_data > 0) { break; } } } SCTP_TCB_SEND_UNLOCK(stcb); return (unsent_data); } static int sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) { struct sctp_init *init; struct sctp_association *asoc; struct sctp_nets *lnet; unsigned int i; init = &cp->init; asoc = &stcb->asoc; /* save off parameters */ asoc->peer_vtag = ntohl(init->initiate_tag); asoc->peers_rwnd = ntohl(init->a_rwnd); /* init tsn's */ asoc->highest_tsn_inside_map = asoc->asconf_seq_in = ntohl(init->initial_tsn) - 1; if (!TAILQ_EMPTY(&asoc->nets)) { /* update any ssthresh's that may have a default */ TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) { lnet->ssthresh = asoc->peers_rwnd; if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) { sctp_log_cwnd(stcb, lnet, 0, SCTP_CWND_INITIALIZATION); } } } SCTP_TCB_SEND_LOCK(stcb); if (asoc->pre_open_streams > ntohs(init->num_inbound_streams)) { unsigned int newcnt; struct sctp_stream_out *outs; struct sctp_stream_queue_pending *sp, *nsp; struct sctp_tmit_chunk *chk, *nchk; /* abandon the upper streams */ newcnt = ntohs(init->num_inbound_streams); TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { if (chk->rec.data.sid >= newcnt) { TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next); asoc->send_queue_cnt--; if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) { asoc->strmout[chk->rec.data.sid].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.sid); #endif } if (chk->data != NULL) { sctp_free_bufspace(stcb, asoc, chk, 1); sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, 0, chk, SCTP_SO_NOT_LOCKED); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); /* sa_ignore FREED_MEMORY */ } } if (asoc->strmout) { for (i = newcnt; i < asoc->pre_open_streams; i++) { outs = &asoc->strmout[i]; TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1); TAILQ_REMOVE(&outs->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1); sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, 0, sp, SCTP_SO_NOT_LOCKED); if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } /* Free the chunk */ sctp_free_a_strmoq(stcb, sp, SCTP_SO_NOT_LOCKED); /* sa_ignore FREED_MEMORY */ } outs->state = SCTP_STREAM_CLOSED; } } /* cut back the count */ asoc->pre_open_streams = newcnt; } SCTP_TCB_SEND_UNLOCK(stcb); asoc->streamoutcnt = asoc->pre_open_streams; if (asoc->strmout) { for (i = 0; i < asoc->streamoutcnt; i++) { asoc->strmout[i].state = SCTP_STREAM_OPEN; } } /* EY - nr_sack: initialize highest tsn in nr_mapping_array */ asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 5, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } /* This is the next one we expect */ asoc->str_reset_seq_in = asoc->asconf_seq_in + 1; asoc->mapping_array_base_tsn = ntohl(init->initial_tsn); asoc->tsn_last_delivered = asoc->cumulative_tsn = asoc->asconf_seq_in; asoc->advanced_peer_ack_point = asoc->last_acked_seq; /* open the requested streams */ if (asoc->strmin != NULL) { /* Free the old ones */ for (i = 0; i < asoc->streamincnt; i++) { sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue); sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue); } SCTP_FREE(asoc->strmin, SCTP_M_STRMI); } if (asoc->max_inbound_streams > ntohs(init->num_outbound_streams)) { asoc->streamincnt = ntohs(init->num_outbound_streams); } else { asoc->streamincnt = asoc->max_inbound_streams; } SCTP_MALLOC(asoc->strmin, struct sctp_stream_in *, asoc->streamincnt * sizeof(struct sctp_stream_in), SCTP_M_STRMI); if (asoc->strmin == NULL) { /* we didn't get memory for the streams! */ SCTPDBG(SCTP_DEBUG_INPUT2, "process_init: couldn't get memory for the streams!\n"); return (-1); } for (i = 0; i < asoc->streamincnt; i++) { asoc->strmin[i].sid = i; asoc->strmin[i].last_mid_delivered = 0xffffffff; TAILQ_INIT(&asoc->strmin[i].inqueue); TAILQ_INIT(&asoc->strmin[i].uno_inqueue); asoc->strmin[i].pd_api_started = 0; asoc->strmin[i].delivery_started = 0; } /* * load_address_from_init will put the addresses into the * association when the COOKIE is processed or the INIT-ACK is * processed. Both types of COOKIE's existing and new call this * routine. It will remove addresses that are no longer in the * association (for the restarting case where addresses are * removed). Up front when the INIT arrives we will discard it if it * is a restart and new addresses have been added. */ /* sa_ignore MEMLEAK */ return (0); } /* * INIT-ACK message processing/consumption returns value < 0 on error */ static int sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id) { struct sctp_association *asoc; struct mbuf *op_err; int retval, abort_flag; uint32_t initack_limit; int nat_friendly = 0; /* First verify that we have no illegal param's */ abort_flag = 0; op_err = sctp_arethere_unrecognized_parameters(m, (offset + sizeof(struct sctp_init_chunk)), &abort_flag, (struct sctp_chunkhdr *)cp, &nat_friendly); if (abort_flag) { /* Send an abort and notify peer */ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_no_unlock = 1; return (-1); } asoc = &stcb->asoc; asoc->peer_supports_nat = (uint8_t)nat_friendly; /* process the peer's parameters in the INIT-ACK */ retval = sctp_process_init((struct sctp_init_chunk *)cp, stcb); if (retval < 0) { return (retval); } initack_limit = offset + ntohs(cp->ch.chunk_length); /* load all addresses */ if ((retval = sctp_load_addresses_from_init(stcb, m, (offset + sizeof(struct sctp_init_chunk)), initack_limit, src, dst, NULL, stcb->asoc.port))) { op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Problem with address parameters"); SCTPDBG(SCTP_DEBUG_INPUT1, "Load addresses from INIT causes an abort %d\n", retval); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } /* if the peer doesn't support asconf, flush the asconf queue */ if (asoc->asconf_supported == 0) { struct sctp_asconf_addr *param, *nparam; TAILQ_FOREACH_SAFE(param, &asoc->asconf_queue, next, nparam) { TAILQ_REMOVE(&asoc->asconf_queue, param, next); SCTP_FREE(param, SCTP_M_ASC_ADDR); } } stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs, stcb->asoc.local_hmacs); if (op_err) { sctp_queue_op_err(stcb, op_err); /* queuing will steal away the mbuf chain to the out queue */ op_err = NULL; } /* extract the cookie and queue it to "echo" it back... */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; net->error_count = 0; /* * Cancel the INIT timer, We do this first before queueing the * cookie. We always cancel at the primary to assue that we are * canceling the timer started by the INIT which always goes to the * primary. */ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); /* calculate the RTO */ net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, sctp_align_safe_nocopy, SCTP_RTT_FROM_NON_DATA); retval = sctp_send_cookie_echo(m, offset, stcb, net); if (retval < 0) { /* * No cookie, we probably should send a op error. But in any * case if there is no cookie in the INIT-ACK, we can * abandon the peer, its broke. */ if (retval == -3) { uint16_t len; len = (uint16_t)(sizeof(struct sctp_error_missing_param) + sizeof(uint16_t)); /* We abort with an error of missing mandatory param */ op_err = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); if (op_err != NULL) { struct sctp_error_missing_param *cause; SCTP_BUF_LEN(op_err) = len; cause = mtod(op_err, struct sctp_error_missing_param *); /* Subtract the reserved param */ cause->cause.code = htons(SCTP_CAUSE_MISSING_PARAM); cause->cause.length = htons(len); cause->num_missing_params = htonl(1); cause->type[0] = htons(SCTP_STATE_COOKIE); } sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; } return (retval); } return (0); } static void sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net) { union sctp_sockstore store; struct sctp_nets *r_net, *f_net; struct timeval tv; int req_prim = 0; uint16_t old_error_counter; if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_heartbeat_chunk)) { /* Invalid length */ return; } memset(&store, 0, sizeof(store)); switch (cp->heartbeat.hb_info.addr_family) { #ifdef INET case AF_INET: if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in)) { store.sin.sin_family = cp->heartbeat.hb_info.addr_family; store.sin.sin_len = cp->heartbeat.hb_info.addr_len; store.sin.sin_port = stcb->rport; memcpy(&store.sin.sin_addr, cp->heartbeat.hb_info.address, sizeof(store.sin.sin_addr)); } else { return; } break; #endif #ifdef INET6 case AF_INET6: if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in6)) { store.sin6.sin6_family = cp->heartbeat.hb_info.addr_family; store.sin6.sin6_len = cp->heartbeat.hb_info.addr_len; store.sin6.sin6_port = stcb->rport; memcpy(&store.sin6.sin6_addr, cp->heartbeat.hb_info.address, sizeof(struct in6_addr)); } else { return; } break; #endif default: return; } r_net = sctp_findnet(stcb, &store.sa); if (r_net == NULL) { SCTPDBG(SCTP_DEBUG_INPUT1, "Huh? I can't find the address I sent it to, discard\n"); return; } if ((r_net && (r_net->dest_state & SCTP_ADDR_UNCONFIRMED)) && (r_net->heartbeat_random1 == cp->heartbeat.hb_info.random_value1) && (r_net->heartbeat_random2 == cp->heartbeat.hb_info.random_value2)) { /* * If the its a HB and it's random value is correct when can * confirm the destination. */ r_net->dest_state &= ~SCTP_ADDR_UNCONFIRMED; if (r_net->dest_state & SCTP_ADDR_REQ_PRIMARY) { stcb->asoc.primary_destination = r_net; r_net->dest_state &= ~SCTP_ADDR_REQ_PRIMARY; f_net = TAILQ_FIRST(&stcb->asoc.nets); if (f_net != r_net) { /* * first one on the list is NOT the primary * sctp_cmpaddr() is much more efficient if * the primary is the first on the list, * make it so. */ TAILQ_REMOVE(&stcb->asoc.nets, r_net, sctp_next); TAILQ_INSERT_HEAD(&stcb->asoc.nets, r_net, sctp_next); } req_prim = 1; } sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_4); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net); } old_error_counter = r_net->error_count; r_net->error_count = 0; r_net->hb_responded = 1; tv.tv_sec = cp->heartbeat.hb_info.time_value_1; tv.tv_usec = cp->heartbeat.hb_info.time_value_2; /* Now lets do a RTO with this */ r_net->RTO = sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, sctp_align_safe_nocopy, SCTP_RTT_FROM_NON_DATA); if (!(r_net->dest_state & SCTP_ADDR_REACHABLE)) { r_net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED); } if (r_net->dest_state & SCTP_ADDR_PF) { r_net->dest_state &= ~SCTP_ADDR_PF; stcb->asoc.cc_functions.sctp_cwnd_update_exit_pf(stcb, net); } if (old_error_counter > 0) { sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_5); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net); } if (r_net == stcb->asoc.primary_destination) { if (stcb->asoc.alternate) { /* release the alternate, primary is good */ sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } } /* Mobility adaptation */ if (req_prim) { if ((sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE) || sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) && sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_PRIM_DELETED)) { sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_INPUT + SCTP_LOC_6); if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_assoc_immediate_retrans(stcb, stcb->asoc.primary_destination); } if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { sctp_move_chunks_from_net(stcb, stcb->asoc.deleted_primary); } sctp_delete_prim_timer(stcb->sctp_ep, stcb, stcb->asoc.deleted_primary); } } } static int sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) { /* * return 0 means we want you to proceed with the abort non-zero * means no abort processing */ struct sctpasochead *head; if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) { /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); } if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) { /* * treat like a case where the cookie expired i.e.: - dump * current cookie. - generate a new vtag. - resend init. */ /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); stcb->asoc.state &= ~SCTP_STATE_COOKIE_ECHOED; stcb->asoc.state |= SCTP_STATE_COOKIE_WAIT; sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, &stcb->asoc); stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); } return (0); } static int sctp_handle_nat_missing_state(struct sctp_tcb *stcb, struct sctp_nets *net) { /* * return 0 means we want you to proceed with the abort non-zero * means no abort processing */ if (stcb->asoc.auth_supported == 0) { SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_nat_missing_state: Peer does not support AUTH, cannot send an asconf\n"); return (0); } sctp_asconf_send_nat_state_update(stcb, net); return (1); } static void sctp_handle_abort(struct sctp_abort_chunk *abort, struct sctp_tcb *stcb, struct sctp_nets *net) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif uint16_t len; uint16_t error; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: handling ABORT\n"); if (stcb == NULL) return; len = ntohs(abort->ch.chunk_length); if (len > sizeof(struct sctp_chunkhdr)) { /* * Need to check the cause codes for our two magic nat * aborts which don't kill the assoc necessarily. */ struct sctp_gen_error_cause *cause; cause = (struct sctp_gen_error_cause *)(abort + 1); error = ntohs(cause->code); if (error == SCTP_CAUSE_NAT_COLLIDING_STATE) { SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state abort flags:%x\n", abort->ch.chunk_flags); if (sctp_handle_nat_colliding_state(stcb)) { return; } } else if (error == SCTP_CAUSE_NAT_MISSING_STATE) { SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state abort flags:%x\n", abort->ch.chunk_flags); if (sctp_handle_nat_missing_state(stcb, net)) { return; } } } else { error = 0; } /* stop any receive timers */ sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_7); /* notify user of the abort and clean up... */ sctp_abort_notification(stcb, 1, error, abort, SCTP_SO_NOT_LOCKED); /* free the tcb */ SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } #ifdef SCTP_ASOCLOG_OF_TSNS sctp_print_out_track_log(stcb); #endif #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif stcb->asoc.state |= SCTP_STATE_WAS_ABORTED; (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_8); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: finished\n"); } static void sctp_start_net_timers(struct sctp_tcb *stcb) { uint32_t cnt_hb_sent; struct sctp_nets *net; cnt_hb_sent = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* * For each network start: 1) A pmtu timer. 2) A HB timer 3) * If the dest in unconfirmed send a hb as well if under * max_hb_burst have been sent. */ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, stcb->sctp_ep, stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) && (cnt_hb_sent < SCTP_BASE_SYSCTL(sctp_hb_maxburst))) { sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED); cnt_hb_sent++; } } if (cnt_hb_sent) { sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_COOKIE_ACK, SCTP_SO_NOT_LOCKED); } } static void sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_flag) { struct sctp_association *asoc; int some_on_streamwheel; int old_state; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown: handling SHUTDOWN\n"); if (stcb == NULL) return; asoc = &stcb->asoc; if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { return; } if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_shutdown_chunk)) { /* Shutdown NOT the expected size */ return; } old_state = SCTP_GET_STATE(asoc); sctp_update_acked(stcb, cp, abort_flag); if (*abort_flag) { return; } if (asoc->control_pdapi) { /* * With a normal shutdown we assume the end of last record. */ SCTP_INP_READ_LOCK(stcb->sctp_ep); if (asoc->control_pdapi->on_strm_q) { struct sctp_stream_in *strm; strm = &asoc->strmin[asoc->control_pdapi->sinfo_stream]; if (asoc->control_pdapi->on_strm_q == SCTP_ON_UNORDERED) { /* Unordered */ TAILQ_REMOVE(&strm->uno_inqueue, asoc->control_pdapi, next_instrm); asoc->control_pdapi->on_strm_q = 0; } else if (asoc->control_pdapi->on_strm_q == SCTP_ON_ORDERED) { /* Ordered */ TAILQ_REMOVE(&strm->inqueue, asoc->control_pdapi, next_instrm); asoc->control_pdapi->on_strm_q = 0; #ifdef INVARIANTS } else { panic("Unknown state on ctrl:%p on_strm_q:%d", asoc->control_pdapi, asoc->control_pdapi->on_strm_q); #endif } } asoc->control_pdapi->end_added = 1; asoc->control_pdapi->pdapi_aborted = 1; asoc->control_pdapi = NULL; SCTP_INP_READ_UNLOCK(stcb->sctp_ep); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { /* assoc was freed while we were unlocked */ SCTP_SOCKET_UNLOCK(so, 1); return; } #endif if (stcb->sctp_socket) { sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); } #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } /* goto SHUTDOWN_RECEIVED state to block new requests */ if (stcb->sctp_socket) { if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT)) { SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_RECEIVED); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); /* * notify upper layer that peer has initiated a * shutdown */ sctp_ulp_notify(SCTP_NOTIFY_PEER_SHUTDOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); /* reset time */ (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); } } if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { /* * stop the shutdown timer, since we WILL move to * SHUTDOWN-ACK-SENT. */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9); } /* Now is there unsent data on a stream somewhere? */ some_on_streamwheel = sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED); if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || some_on_streamwheel) { /* By returning we will push more data out */ return; } else { /* no outstanding data to send, so move on... */ /* send SHUTDOWN-ACK */ /* move to SHUTDOWN-ACK-SENT state */ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) { SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); sctp_stop_timers_for_shutdown(stcb); sctp_send_shutdown_ack(stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net); } else if (old_state == SCTP_STATE_SHUTDOWN_ACK_SENT) { sctp_send_shutdown_ack(stcb, net); } } } static void sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED, struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_association *asoc; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; so = SCTP_INP_SO(stcb->sctp_ep); #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_ack: handling SHUTDOWN ACK\n"); if (stcb == NULL) return; asoc = &stcb->asoc; /* process according to association state */ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { /* unexpected SHUTDOWN-ACK... do OOTB handling... */ sctp_send_shutdown_complete(stcb, net, 1); SCTP_TCB_UNLOCK(stcb); return; } if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { /* unexpected SHUTDOWN-ACK... so ignore... */ SCTP_TCB_UNLOCK(stcb); return; } if (asoc->control_pdapi) { /* * With a normal shutdown we assume the end of last record. */ SCTP_INP_READ_LOCK(stcb->sctp_ep); asoc->control_pdapi->end_added = 1; asoc->control_pdapi->pdapi_aborted = 1; asoc->control_pdapi = NULL; SCTP_INP_READ_UNLOCK(stcb->sctp_ep); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { /* assoc was freed while we were unlocked */ SCTP_SOCKET_UNLOCK(so, 1); return; } #endif sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } #ifdef INVARIANTS if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) { panic("Queues are not empty when handling SHUTDOWN-ACK"); } #endif /* stop the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_10); /* send SHUTDOWN-COMPLETE */ sctp_send_shutdown_complete(stcb, net, 0); /* notify upper layer protocol */ if (stcb->sctp_socket) { if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { stcb->sctp_socket->so_snd.sb_cc = 0; } sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } SCTP_STAT_INCR_COUNTER32(sctps_shutdown); /* free the TCB but first save off the ep */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_11); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } /* * Skip past the param header and then we will find the chunk that caused the * problem. There are two possibilities ASCONF or FWD-TSN other than that and * our peer must be broken. */ static void sctp_process_unrecog_chunk(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr, struct sctp_nets *net) { struct sctp_chunkhdr *chk; chk = (struct sctp_chunkhdr *)((caddr_t)phdr + sizeof(*phdr)); switch (chk->chunk_type) { case SCTP_ASCONF_ACK: case SCTP_ASCONF: sctp_asconf_cleanup(stcb, net); break; case SCTP_IFORWARD_CUM_TSN: case SCTP_FORWARD_CUM_TSN: stcb->asoc.prsctp_supported = 0; break; default: SCTPDBG(SCTP_DEBUG_INPUT2, "Peer does not support chunk type %d(%x)??\n", chk->chunk_type, (uint32_t)chk->chunk_type); break; } } /* * Skip past the param header and then we will find the param that caused the * problem. There are a number of param's in a ASCONF OR the prsctp param * these will turn of specific features. * XXX: Is this the right thing to do? */ static void sctp_process_unrecog_param(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr) { struct sctp_paramhdr *pbad; pbad = phdr + 1; switch (ntohs(pbad->param_type)) { /* pr-sctp draft */ case SCTP_PRSCTP_SUPPORTED: stcb->asoc.prsctp_supported = 0; break; case SCTP_SUPPORTED_CHUNK_EXT: break; /* draft-ietf-tsvwg-addip-sctp */ case SCTP_HAS_NAT_SUPPORT: stcb->asoc.peer_supports_nat = 0; break; case SCTP_ADD_IP_ADDRESS: case SCTP_DEL_IP_ADDRESS: case SCTP_SET_PRIM_ADDR: stcb->asoc.asconf_supported = 0; break; case SCTP_SUCCESS_REPORT: case SCTP_ERROR_CAUSE_IND: SCTPDBG(SCTP_DEBUG_INPUT2, "Huh, the peer does not support success? or error cause?\n"); SCTPDBG(SCTP_DEBUG_INPUT2, "Turning off ASCONF to this strange peer\n"); stcb->asoc.asconf_supported = 0; break; default: SCTPDBG(SCTP_DEBUG_INPUT2, "Peer does not support param type %d(%x)??\n", pbad->param_type, (uint32_t)pbad->param_type); break; } } static int sctp_handle_error(struct sctp_chunkhdr *ch, struct sctp_tcb *stcb, struct sctp_nets *net) { int chklen; struct sctp_paramhdr *phdr; uint16_t error, error_type; uint16_t error_len; struct sctp_association *asoc; int adjust; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif /* parse through all of the errors and process */ asoc = &stcb->asoc; phdr = (struct sctp_paramhdr *)((caddr_t)ch + sizeof(struct sctp_chunkhdr)); chklen = ntohs(ch->chunk_length) - sizeof(struct sctp_chunkhdr); error = 0; while ((size_t)chklen >= sizeof(struct sctp_paramhdr)) { /* Process an Error Cause */ error_type = ntohs(phdr->param_type); error_len = ntohs(phdr->param_length); if ((error_len > chklen) || (error_len == 0)) { /* invalid param length for this param */ SCTPDBG(SCTP_DEBUG_INPUT1, "Bogus length in error param- chunk left:%d errorlen:%d\n", chklen, error_len); return (0); } if (error == 0) { /* report the first error cause */ error = error_type; } switch (error_type) { case SCTP_CAUSE_INVALID_STREAM: case SCTP_CAUSE_MISSING_PARAM: case SCTP_CAUSE_INVALID_PARAM: case SCTP_CAUSE_NO_USER_DATA: SCTPDBG(SCTP_DEBUG_INPUT1, "Software error we got a %d back? We have a bug :/ (or do they?)\n", error_type); break; case SCTP_CAUSE_NAT_COLLIDING_STATE: SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state abort flags:%x\n", ch->chunk_flags); if (sctp_handle_nat_colliding_state(stcb)) { return (0); } break; case SCTP_CAUSE_NAT_MISSING_STATE: SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state abort flags:%x\n", ch->chunk_flags); if (sctp_handle_nat_missing_state(stcb, net)) { return (0); } break; case SCTP_CAUSE_STALE_COOKIE: /* * We only act if we have echoed a cookie and are * waiting. */ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) { int *p; p = (int *)((caddr_t)phdr + sizeof(*phdr)); /* Save the time doubled */ asoc->cookie_preserve_req = ntohl(*p) << 1; asoc->stale_cookie_count++; if (asoc->stale_cookie_count > asoc->max_init_times) { sctp_abort_notification(stcb, 0, 0, NULL, SCTP_SO_NOT_LOCKED); /* now free the asoc */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_12); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif return (-1); } /* blast back to INIT state */ sctp_toss_old_cookies(stcb, &stcb->asoc); asoc->state &= ~SCTP_STATE_COOKIE_ECHOED; asoc->state |= SCTP_STATE_COOKIE_WAIT; sctp_stop_all_cookie_timers(stcb); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); } break; case SCTP_CAUSE_UNRESOLVABLE_ADDR: /* * Nothing we can do here, we don't do hostname * addresses so if the peer does not like my IPv6 * (or IPv4 for that matter) it does not matter. If * they don't support that type of address, they can * NOT possibly get that packet type... i.e. with no * IPv6 you can't receive a IPv6 packet. so we can * safely ignore this one. If we ever added support * for HOSTNAME Addresses, then we would need to do * something here. */ break; case SCTP_CAUSE_UNRECOG_CHUNK: sctp_process_unrecog_chunk(stcb, phdr, net); break; case SCTP_CAUSE_UNRECOG_PARAM: sctp_process_unrecog_param(stcb, phdr); break; case SCTP_CAUSE_COOKIE_IN_SHUTDOWN: /* * We ignore this since the timer will drive out a * new cookie anyway and there timer will drive us * to send a SHUTDOWN_COMPLETE. We can't send one * here since we don't have their tag. */ break; case SCTP_CAUSE_DELETING_LAST_ADDR: case SCTP_CAUSE_RESOURCE_SHORTAGE: case SCTP_CAUSE_DELETING_SRC_ADDR: /* * We should NOT get these here, but in a * ASCONF-ACK. */ SCTPDBG(SCTP_DEBUG_INPUT2, "Peer sends ASCONF errors in a Operational Error?<%d>?\n", error_type); break; case SCTP_CAUSE_OUT_OF_RESC: /* * And what, pray tell do we do with the fact that * the peer is out of resources? Not really sure we * could do anything but abort. I suspect this * should have came WITH an abort instead of in a * OP-ERROR. */ break; default: SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_handle_error: unknown error type = 0x%xh\n", error_type); break; } adjust = SCTP_SIZE32(error_len); chklen -= adjust; phdr = (struct sctp_paramhdr *)((caddr_t)phdr + adjust); } sctp_ulp_notify(SCTP_NOTIFY_REMOTE_ERROR, stcb, error, ch, SCTP_SO_NOT_LOCKED); return (0); } static int sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id) { struct sctp_init_ack *init_ack; struct mbuf *op_err; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init_ack: handling INIT-ACK\n"); if (stcb == NULL) { SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init_ack: TCB is null\n"); return (-1); } if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_ack_chunk)) { /* Invalid length */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } init_ack = &cp->init; /* validate parameters */ if (init_ack->initiate_tag == 0) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } if (ntohl(init_ack->a_rwnd) < SCTP_MIN_RWND) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } if (init_ack->num_inbound_streams == 0) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } if (init_ack->num_outbound_streams == 0) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } /* process according to association state... */ switch (stcb->asoc.state & SCTP_STATE_MASK) { case SCTP_STATE_COOKIE_WAIT: /* this is the expected state for this chunk */ /* process the INIT-ACK parameters */ if (stcb->asoc.primary_destination->dest_state & SCTP_ADDR_UNCONFIRMED) { /* * The primary is where we sent the INIT, we can * always consider it confirmed when the INIT-ACK is * returned. Do this before we load addresses * though. */ stcb->asoc.primary_destination->dest_state &= ~SCTP_ADDR_UNCONFIRMED; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, stcb, 0, (void *)stcb->asoc.primary_destination, SCTP_SO_NOT_LOCKED); } if (sctp_process_init_ack(m, iphlen, offset, src, dst, sh, cp, stcb, net, abort_no_unlock, mflowtype, mflowid, vrf_id) < 0) { /* error in parsing parameters */ return (-1); } /* update our state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to COOKIE-ECHOED state\n"); SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_ECHOED); /* reset the RTO calc */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); /* * collapse the init timer back in case of a exponential * backoff */ sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, stcb->sctp_ep, stcb, net); /* * the send at the end of the inbound data processing will * cause the cookie to be sent */ break; case SCTP_STATE_SHUTDOWN_SENT: /* incorrect state... discard */ break; case SCTP_STATE_COOKIE_ECHOED: /* incorrect state... discard */ break; case SCTP_STATE_OPEN: /* incorrect state... discard */ break; case SCTP_STATE_EMPTY: case SCTP_STATE_INUSE: default: /* incorrect state... discard */ return (-1); break; } SCTPDBG(SCTP_DEBUG_INPUT1, "Leaving handle-init-ack end\n"); return (0); } static struct sctp_tcb * sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, struct sctp_inpcb *inp, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port); /* * handle a state cookie for an existing association m: input packet mbuf * chain-- assumes a pullup on IP/SCTP/COOKIE-ECHO chunk note: this is a * "split" mbuf and the cookie signature does not exist offset: offset into * mbuf to the cookie-echo chunk */ static struct sctp_tcb * sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_association *asoc; struct sctp_init_chunk *init_cp, init_buf; struct sctp_init_ack_chunk *initack_cp, initack_buf; struct sctp_nets *net; struct mbuf *op_err; int init_offset, initack_offset, i; int retval; int spec_flag = 0; uint32_t how_indx; #if defined(SCTP_DETAILED_STR_STATS) int j; #endif net = *netp; /* I know that the TCB is non-NULL from the caller */ asoc = &stcb->asoc; for (how_indx = 0; how_indx < sizeof(asoc->cookie_how); how_indx++) { if (asoc->cookie_how[how_indx] == 0) break; } if (how_indx < sizeof(asoc->cookie_how)) { asoc->cookie_how[how_indx] = 1; } if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { /* SHUTDOWN came in after sending INIT-ACK */ sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination); op_err = sctp_generate_cause(SCTP_CAUSE_COOKIE_IN_SHUTDOWN, ""); sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, net->port); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 2; return (NULL); } /* * find and validate the INIT chunk in the cookie (peer's info) the * INIT should start after the cookie-echo header struct (chunk * header, state cookie header struct) */ init_offset = offset += sizeof(struct sctp_cookie_echo_chunk); init_cp = (struct sctp_init_chunk *) sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk), (uint8_t *)&init_buf); if (init_cp == NULL) { /* could not pull a INIT chunk in cookie */ return (NULL); } if (init_cp->ch.chunk_type != SCTP_INITIATION) { return (NULL); } /* * find and validate the INIT-ACK chunk in the cookie (my info) the * INIT-ACK follows the INIT chunk */ initack_offset = init_offset + SCTP_SIZE32(ntohs(init_cp->ch.chunk_length)); initack_cp = (struct sctp_init_ack_chunk *) sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk), (uint8_t *)&initack_buf); if (initack_cp == NULL) { /* could not pull INIT-ACK chunk in cookie */ return (NULL); } if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) { return (NULL); } if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && (ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag)) { /* * case D in Section 5.2.4 Table 2: MMAA process accordingly * to get into the OPEN state */ if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) { /*- * Opps, this means that we somehow generated two vtag's * the same. I.e. we did: * Us Peer * <---INIT(tag=a)------ * ----INIT-ACK(tag=t)--> * ----INIT(tag=t)------> *1 * <---INIT-ACK(tag=a)--- * <----CE(tag=t)------------- *2 * * At point *1 we should be generating a different * tag t'. Which means we would throw away the CE and send * ours instead. Basically this is case C (throw away side). */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 17; return (NULL); } switch (SCTP_GET_STATE(asoc)) { case SCTP_STATE_COOKIE_WAIT: case SCTP_STATE_COOKIE_ECHOED: /* * INIT was sent but got a COOKIE_ECHO with the * correct tags... just accept it...but we must * process the init so that we can make sure we have * the right seq no's. */ /* First we must process the INIT !! */ retval = sctp_process_init(init_cp, stcb); if (retval < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 3; return (NULL); } /* we have already processed the INIT so no problem */ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_13); sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_14); /* update current state */ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) SCTP_STAT_INCR_COUNTER32(sctps_activeestab); else SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } SCTP_STAT_INCR_GAUGE32(sctps_currestab); sctp_stop_all_cookie_timers(stcb); if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && (!SCTP_IS_LISTENING(inp))) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif /* * Here is where collision would go if we * did a connect() and instead got a * init/init-ack/cookie done before the * init-ack came back.. */ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { SCTP_SOCKET_UNLOCK(so, 1); return (NULL); } #endif soisconnected(stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } /* notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_UP; /* * since we did not send a HB make sure we don't * double things */ net->hb_responded = 1; net->RTO = sctp_calculate_rto(stcb, asoc, net, &cookie->time_entered, sctp_align_unsafe_makecopy, SCTP_RTT_FROM_NON_DATA); if (stcb->asoc.sctp_autoclose_ticks && (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE))) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); } break; default: /* * we're in the OPEN state (or beyond), so peer must * have simply lost the COOKIE-ACK */ break; } /* end switch */ sctp_stop_all_cookie_timers(stcb); /* * We ignore the return code here.. not sure if we should * somehow abort.. but we do have an existing asoc. This * really should not fail. */ if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src, stcb->asoc.port)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 4; return (NULL); } /* respond with a COOKIE-ACK */ sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 5; return (stcb); } if (ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag && ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag && cookie->tie_tag_my_vtag == 0 && cookie->tie_tag_peer_vtag == 0) { /* * case C in Section 5.2.4 Table 2: XMOO silently discard */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 6; return (NULL); } /* * If nat support, and the below and stcb is established, send back * a ABORT(colliding state) if we are established. */ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) && (asoc->peer_supports_nat) && ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) || (asoc->peer_vtag == 0)))) { /* * Special case - Peer's support nat. We may have two init's * that we gave out the same tag on since one was not * established.. i.e. we get INIT from host-1 behind the nat * and we respond tag-a, we get a INIT from host-2 behind * the nat and we get tag-a again. Then we bring up host-1 * (or 2's) assoc, Then comes the cookie from hsot-2 (or 1). * Now we have colliding state. We must send an abort here * with colliding state indication. */ op_err = sctp_generate_cause(SCTP_CAUSE_NAT_COLLIDING_STATE, ""); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); return (NULL); } if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) || (asoc->peer_vtag == 0))) { /* * case B in Section 5.2.4 Table 2: MXAA or MOAA my info * should be ok, re-accept peer info */ if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) { /* * Extension of case C. If we hit this, then the * random number generator returned the same vtag * when we first sent our INIT-ACK and when we later * sent our INIT. The side with the seq numbers that * are different will be the one that normnally * would have hit case C. This in effect "extends" * our vtags in this collision case to be 64 bits. * The same collision could occur aka you get both * vtag and seq number the same twice in a row.. but * is much less likely. If it did happen then we * would proceed through and bring up the assoc.. we * may end up with the wrong stream setup however.. * which would be bad.. but there is no way to * tell.. until we send on a stream that does not * exist :-) */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 7; return (NULL); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 8; sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_15); sctp_stop_all_cookie_timers(stcb); /* * since we did not send a HB make sure we don't double * things */ net->hb_responded = 1; if (stcb->asoc.sctp_autoclose_ticks && sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); } asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); if (ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) { /* * Ok the peer probably discarded our data (if we * echoed a cookie+data). So anything on the * sent_queue should be marked for retransmit, we * may not get something to kick us so it COULD * still take a timeout to move these.. but it can't * hurt to mark them. */ struct sctp_tmit_chunk *chk; TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if (chk->sent < SCTP_DATAGRAM_RESEND) { chk->sent = SCTP_DATAGRAM_RESEND; sctp_flight_size_decrease(chk); sctp_total_flight_decrease(stcb, chk); sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); spec_flag++; } } } /* process the INIT info (peer's info) */ retval = sctp_process_init(init_cp, stcb); if (retval < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 9; return (NULL); } if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src, stcb->asoc.port)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 10; return (NULL); } if ((asoc->state & SCTP_STATE_COOKIE_WAIT) || (asoc->state & SCTP_STATE_COOKIE_ECHOED)) { *notification = SCTP_NOTIFY_ASSOC_UP; if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && (!SCTP_IS_LISTENING(inp))) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { SCTP_SOCKET_UNLOCK(so, 1); return (NULL); } #endif soisconnected(stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) SCTP_STAT_INCR_COUNTER32(sctps_activeestab); else SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); } else if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { SCTP_STAT_INCR_COUNTER32(sctps_restartestab); } else { SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); } SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (spec_flag) { /* * only if we have retrans set do we do this. What * this call does is get only the COOKIE-ACK out and * then when we return the normal call to * sctp_chunk_output will get the retrans out behind * this. */ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_COOKIE_ACK, SCTP_SO_NOT_LOCKED); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 11; return (stcb); } if ((ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag && ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) && cookie->tie_tag_my_vtag == asoc->my_vtag_nonce && cookie->tie_tag_peer_vtag == asoc->peer_vtag_nonce && cookie->tie_tag_peer_vtag != 0) { struct sctpasochead *head; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif if (asoc->peer_supports_nat) { /* * This is a gross gross hack. Just call the * cookie_new code since we are allowing a duplicate * association. I hope this works... */ return (sctp_process_cookie_new(m, iphlen, offset, src, dst, sh, cookie, cookie_len, inp, netp, init_src, notification, auth_skipped, auth_offset, auth_len, mflowtype, mflowid, vrf_id, port)); } /* * case A in Section 5.2.4 Table 2: XXMM (peer restarted) */ /* temp code */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 12; sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_17); /* notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_RESTART; atomic_add_int(&stcb->asoc.refcnt, 1); if ((SCTP_GET_STATE(asoc) != SCTP_STATE_OPEN) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT)) { SCTP_STAT_INCR_GAUGE32(sctps_currestab); } if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { SCTP_STAT_INCR_GAUGE32(sctps_restartestab); } else if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) { SCTP_STAT_INCR_GAUGE32(sctps_collisionestab); } if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } else if (!(asoc->state & SCTP_STATE_SHUTDOWN_SENT)) { /* move to OPEN state, if not in SHUTDOWN_SENT */ SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); } asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn); asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number; asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1; asoc->str_reset_seq_in = asoc->init_seq_number; asoc->advanced_peer_ack_point = asoc->last_acked_seq; if (asoc->mapping_array) { memset(asoc->mapping_array, 0, asoc->mapping_array_size); } if (asoc->nr_mapping_array) { memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size); } SCTP_TCB_UNLOCK(stcb); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); SCTP_SOCKET_LOCK(so, 1); #endif SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(stcb->sctp_ep); SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); /* send up all the data */ SCTP_TCB_SEND_LOCK(stcb); sctp_report_all_outbound(stcb, 0, 1, SCTP_SO_LOCKED); for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].chunks_on_queues = 0; #if defined(SCTP_DETAILED_STR_STATS) for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { asoc->strmout[i].abandoned_sent[j] = 0; asoc->strmout[i].abandoned_unsent[j] = 0; } #else asoc->strmout[i].abandoned_sent[0] = 0; asoc->strmout[i].abandoned_unsent[0] = 0; #endif stcb->asoc.strmout[i].sid = i; stcb->asoc.strmout[i].next_mid_ordered = 0; stcb->asoc.strmout[i].next_mid_unordered = 0; stcb->asoc.strmout[i].last_msg_incomplete = 0; } /* process the INIT-ACK info (my info) */ asoc->my_vtag = ntohl(initack_cp->init.initiate_tag); asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); /* pull from vtag hash */ LIST_REMOVE(stcb, sctp_asocs); /* re-insert to new vtag position */ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); SCTP_TCB_SEND_UNLOCK(stcb); SCTP_INP_WUNLOCK(stcb->sctp_ep); SCTP_INP_INFO_WUNLOCK(); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif asoc->total_flight = 0; asoc->total_flight_count = 0; /* process the INIT info (peer's info) */ retval = sctp_process_init(init_cp, stcb); if (retval < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 13; return (NULL); } /* * since we did not send a HB make sure we don't double * things */ net->hb_responded = 1; if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src, stcb->asoc.port)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 14; return (NULL); } /* respond with a COOKIE-ACK */ sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 15; return (stcb); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 16; /* all other cases... */ return (NULL); } /* * handle a state cookie for a new association m: input packet mbuf chain-- * assumes a pullup on IP/SCTP/COOKIE-ECHO chunk note: this is a "split" mbuf * and the cookie signature does not exist offset: offset into mbuf to the * cookie-echo chunk length: length of the cookie chunk to: where the init * was from returns a new TCB */ static struct sctp_tcb * sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, struct sctp_inpcb *inp, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_tcb *stcb; struct sctp_init_chunk *init_cp, init_buf; struct sctp_init_ack_chunk *initack_cp, initack_buf; union sctp_sockstore store; struct sctp_association *asoc; int init_offset, initack_offset, initack_limit; int retval; int error = 0; uint8_t auth_chunk_buf[SCTP_PARAM_BUFFER_SIZE]; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; so = SCTP_INP_SO(inp); #endif /* * find and validate the INIT chunk in the cookie (peer's info) the * INIT should start after the cookie-echo header struct (chunk * header, state cookie header struct) */ init_offset = offset + sizeof(struct sctp_cookie_echo_chunk); init_cp = (struct sctp_init_chunk *) sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk), (uint8_t *)&init_buf); if (init_cp == NULL) { /* could not pull a INIT chunk in cookie */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: could not pull INIT chunk hdr\n"); return (NULL); } if (init_cp->ch.chunk_type != SCTP_INITIATION) { SCTPDBG(SCTP_DEBUG_INPUT1, "HUH? process_cookie_new: could not find INIT chunk!\n"); return (NULL); } initack_offset = init_offset + SCTP_SIZE32(ntohs(init_cp->ch.chunk_length)); /* * find and validate the INIT-ACK chunk in the cookie (my info) the * INIT-ACK follows the INIT chunk */ initack_cp = (struct sctp_init_ack_chunk *) sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk), (uint8_t *)&initack_buf); if (initack_cp == NULL) { /* could not pull INIT-ACK chunk in cookie */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: could not pull INIT-ACK chunk hdr\n"); return (NULL); } if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) { return (NULL); } /* * NOTE: We can't use the INIT_ACK's chk_length to determine the * "initack_limit" value. This is because the chk_length field * includes the length of the cookie, but the cookie is omitted when * the INIT and INIT_ACK are tacked onto the cookie... */ initack_limit = offset + cookie_len; /* * now that we know the INIT/INIT-ACK are in place, create a new TCB * and popluate */ /* * Here we do a trick, we set in NULL for the proc/thread argument. * We do this since in effect we only use the p argument when the * socket is unbound and we must do an implicit bind. Since we are * getting a cookie, we cannot be unbound. */ stcb = sctp_aloc_assoc(inp, init_src, &error, ntohl(initack_cp->init.initiate_tag), vrf_id, ntohs(initack_cp->init.num_outbound_streams), port, (struct thread *)NULL ); if (stcb == NULL) { struct mbuf *op_err; /* memory problem? */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: no room for another TCB!\n"); op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); return (NULL); } /* get the correct sctp_nets */ if (netp) *netp = sctp_findnet(stcb, init_src); asoc = &stcb->asoc; /* get scope variables out of cookie */ asoc->scope.ipv4_local_scope = cookie->ipv4_scope; asoc->scope.site_scope = cookie->site_scope; asoc->scope.local_scope = cookie->local_scope; asoc->scope.loopback_scope = cookie->loopback_scope; if ((asoc->scope.ipv4_addr_legal != cookie->ipv4_addr_legal) || (asoc->scope.ipv6_addr_legal != cookie->ipv6_addr_legal)) { struct mbuf *op_err; /* * Houston we have a problem. The EP changed while the * cookie was in flight. Only recourse is to abort the * association. */ atomic_add_int(&stcb->asoc.refcnt, 1); op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_18); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif atomic_subtract_int(&stcb->asoc.refcnt, 1); return (NULL); } /* process the INIT-ACK info (my info) */ asoc->my_vtag = ntohl(initack_cp->init.initiate_tag); asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn); asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number; asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1; asoc->str_reset_seq_in = asoc->init_seq_number; asoc->advanced_peer_ack_point = asoc->last_acked_seq; /* process the INIT info (peer's info) */ if (netp) retval = sctp_process_init(init_cp, stcb); else retval = 0; if (retval < 0) { atomic_add_int(&stcb->asoc.refcnt, 1); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_19); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif atomic_subtract_int(&stcb->asoc.refcnt, 1); return (NULL); } /* load all addresses */ if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src, port)) { atomic_add_int(&stcb->asoc.refcnt, 1); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_20); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif atomic_subtract_int(&stcb->asoc.refcnt, 1); return (NULL); } /* * verify any preceding AUTH chunk that was skipped */ /* pull the local authentication parameters from the cookie/init-ack */ sctp_auth_get_cookie_params(stcb, m, initack_offset + sizeof(struct sctp_init_ack_chunk), initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk))); if (auth_skipped) { struct sctp_auth_chunk *auth; auth = (struct sctp_auth_chunk *) sctp_m_getptr(m, auth_offset, auth_len, auth_chunk_buf); if ((auth == NULL) || sctp_handle_auth(stcb, auth, m, auth_offset)) { /* auth HMAC failed, dump the assoc and packet */ SCTPDBG(SCTP_DEBUG_AUTH1, "COOKIE-ECHO: AUTH failed\n"); atomic_add_int(&stcb->asoc.refcnt, 1); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_21); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif atomic_subtract_int(&stcb->asoc.refcnt, 1); return (NULL); } else { /* remaining chunks checked... good to go */ stcb->asoc.authenticated = 1; } } /* update current state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n"); SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } sctp_stop_all_cookie_timers(stcb); SCTP_STAT_INCR_COUNTER32(sctps_passiveestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); /* * if we're doing ASCONFs, check to see if we have any new local * addresses that need to get added to the peer (eg. addresses * changed while cookie echo in flight). This needs to be done * after we go to the OPEN state to do the correct asconf * processing. else, make sure we have the correct addresses in our * lists */ /* warning, we re-use sin, sin6, sa_store here! */ /* pull in local_address (our "from" address) */ switch (cookie->laddr_type) { #ifdef INET case SCTP_IPV4_ADDRESS: /* source addr is IPv4 */ memset(&store.sin, 0, sizeof(struct sockaddr_in)); store.sin.sin_family = AF_INET; store.sin.sin_len = sizeof(struct sockaddr_in); store.sin.sin_addr.s_addr = cookie->laddress[0]; break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: /* source addr is IPv6 */ memset(&store.sin6, 0, sizeof(struct sockaddr_in6)); store.sin6.sin6_family = AF_INET6; store.sin6.sin6_len = sizeof(struct sockaddr_in6); store.sin6.sin6_scope_id = cookie->scope_id; memcpy(&store.sin6.sin6_addr, cookie->laddress, sizeof(struct in6_addr)); break; #endif default: atomic_add_int(&stcb->asoc.refcnt, 1); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_22); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif atomic_subtract_int(&stcb->asoc.refcnt, 1); return (NULL); } /* set up to notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_UP; if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && (!SCTP_IS_LISTENING(inp))) { /* * This is an endpoint that called connect() how it got a * cookie that is NEW is a bit of a mystery. It must be that * the INIT was sent, but before it got there.. a complete * INIT/INIT-ACK/COOKIE arrived. But of course then it * should have went to the other code.. not here.. oh well.. * a bit of protection is worth having.. */ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { SCTP_SOCKET_UNLOCK(so, 1); return (NULL); } #endif soisconnected(stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } else if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (SCTP_IS_LISTENING(inp))) { /* * We don't want to do anything with this one. Since it is * the listening guy. The timer will get started for * accepted connections in the caller. */ ; } /* since we did not send a HB make sure we don't double things */ if ((netp) && (*netp)) (*netp)->hb_responded = 1; if (stcb->asoc.sctp_autoclose_ticks && sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); } (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); if ((netp != NULL) && (*netp != NULL)) { /* calculate the RTT and set the encaps port */ (*netp)->RTO = sctp_calculate_rto(stcb, asoc, *netp, &cookie->time_entered, sctp_align_unsafe_makecopy, SCTP_RTT_FROM_NON_DATA); } /* respond with a COOKIE-ACK */ sctp_send_cookie_ack(stcb); /* * check the address lists for any ASCONFs that need to be sent * AFTER the cookie-ack is sent */ sctp_check_address_list(stcb, m, initack_offset + sizeof(struct sctp_init_ack_chunk), initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk)), &store.sa, cookie->local_scope, cookie->site_scope, cookie->ipv4_scope, cookie->loopback_scope); return (stcb); } /* * CODE LIKE THIS NEEDS TO RUN IF the peer supports the NAT extension, i.e * we NEED to make sure we are not already using the vtag. If so we * need to send back an ABORT-TRY-AGAIN-WITH-NEW-TAG No middle box bit! head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { if ((stcb->asoc.my_vtag == tag) && (stcb->rport == rport) && (inp == stcb->sctp_ep)) { -- SEND ABORT - TRY AGAIN -- } } */ /* * handles a COOKIE-ECHO message stcb: modified to either a new or left as * existing (non-NULL) TCB */ static struct mbuf * sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_cookie_echo_chunk *cp, struct sctp_inpcb **inp_p, struct sctp_tcb **stcb, struct sctp_nets **netp, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, struct sctp_tcb **locked_tcb, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_state_cookie *cookie; struct sctp_tcb *l_stcb = *stcb; struct sctp_inpcb *l_inp; struct sockaddr *to; struct sctp_pcb *ep; struct mbuf *m_sig; uint8_t calc_sig[SCTP_SIGNATURE_SIZE], tmp_sig[SCTP_SIGNATURE_SIZE]; uint8_t *sig; uint8_t cookie_ok = 0; unsigned int sig_offset, cookie_offset; unsigned int cookie_len; struct timeval now; struct timeval time_expires; int notification = 0; struct sctp_nets *netl; int had_a_existing_tcb = 0; int send_int_conf = 0; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_cookie: handling COOKIE-ECHO\n"); if (inp_p == NULL) { return (NULL); } cookie = &cp->cookie; cookie_offset = offset + sizeof(struct sctp_chunkhdr); cookie_len = ntohs(cp->ch.chunk_length); if ((cookie->peerport != sh->src_port) || (cookie->myport != sh->dest_port) || (cookie->my_vtag != sh->v_tag)) { /* * invalid ports or bad tag. Note that we always leave the * v_tag in the header in network order and when we stored * it in the my_vtag slot we also left it in network order. * This maintains the match even though it may be in the * opposite byte order of the machine :-> */ return (NULL); } if (cookie_len < sizeof(struct sctp_cookie_echo_chunk) + sizeof(struct sctp_init_chunk) + sizeof(struct sctp_init_ack_chunk) + SCTP_SIGNATURE_SIZE) { /* cookie too small */ return (NULL); } /* * split off the signature into its own mbuf (since it should not be * calculated in the sctp_hmac_m() call). */ sig_offset = offset + cookie_len - SCTP_SIGNATURE_SIZE; m_sig = m_split(m, sig_offset, M_NOWAIT); if (m_sig == NULL) { /* out of memory or ?? */ return (NULL); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m_sig, SCTP_MBUF_SPLIT); } #endif /* * compute the signature/digest for the cookie */ ep = &(*inp_p)->sctp_ep; l_inp = *inp_p; if (l_stcb) { SCTP_TCB_UNLOCK(l_stcb); } SCTP_INP_RLOCK(l_inp); if (l_stcb) { SCTP_TCB_LOCK(l_stcb); } /* which cookie is it? */ if ((cookie->time_entered.tv_sec < (long)ep->time_of_secret_change) && (ep->current_secret_number != ep->last_secret_number)) { /* it's the old cookie */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *)ep->secret_key[(int)ep->last_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); } else { /* it's the current cookie */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *)ep->secret_key[(int)ep->current_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); } /* get the signature */ SCTP_INP_RUNLOCK(l_inp); sig = (uint8_t *)sctp_m_getptr(m_sig, 0, SCTP_SIGNATURE_SIZE, (uint8_t *)&tmp_sig); if (sig == NULL) { /* couldn't find signature */ sctp_m_freem(m_sig); return (NULL); } /* compare the received digest with the computed digest */ if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) { /* try the old cookie? */ if ((cookie->time_entered.tv_sec == (long)ep->time_of_secret_change) && (ep->current_secret_number != ep->last_secret_number)) { /* compute digest with old */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *)ep->secret_key[(int)ep->last_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); /* compare */ if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) == 0) cookie_ok = 1; } } else { cookie_ok = 1; } /* * Now before we continue we must reconstruct our mbuf so that * normal processing of any other chunks will work. */ { struct mbuf *m_at; m_at = m; while (SCTP_BUF_NEXT(m_at) != NULL) { m_at = SCTP_BUF_NEXT(m_at); } SCTP_BUF_NEXT(m_at) = m_sig; } if (cookie_ok == 0) { SCTPDBG(SCTP_DEBUG_INPUT2, "handle_cookie_echo: cookie signature validation failed!\n"); SCTPDBG(SCTP_DEBUG_INPUT2, "offset = %u, cookie_offset = %u, sig_offset = %u\n", (uint32_t)offset, cookie_offset, sig_offset); return (NULL); } /* * check the cookie timestamps to be sure it's not stale */ (void)SCTP_GETTIME_TIMEVAL(&now); /* Expire time is in Ticks, so we convert to seconds */ time_expires.tv_sec = cookie->time_entered.tv_sec + TICKS_TO_SEC(cookie->cookie_life); time_expires.tv_usec = cookie->time_entered.tv_usec; /* * TODO sctp_constants.h needs alternative time macros when _KERNEL * is undefined. */ if (timevalcmp(&now, &time_expires, >)) { /* cookie is stale! */ struct mbuf *op_err; struct sctp_error_stale_cookie *cause; uint32_t tim; op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_stale_cookie), 0, M_NOWAIT, 1, MT_DATA); if (op_err == NULL) { /* FOOBAR */ return (NULL); } /* Set the len */ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_stale_cookie); cause = mtod(op_err, struct sctp_error_stale_cookie *); cause->cause.code = htons(SCTP_CAUSE_STALE_COOKIE); cause->cause.length = htons((sizeof(struct sctp_paramhdr) + (sizeof(uint32_t)))); /* seconds to usec */ tim = (now.tv_sec - time_expires.tv_sec) * 1000000; /* add in usec */ if (tim == 0) tim = now.tv_usec - cookie->time_entered.tv_usec; cause->stale_time = htonl(tim); sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err, mflowtype, mflowid, l_inp->fibnum, vrf_id, port); return (NULL); } /* * Now we must see with the lookup address if we have an existing * asoc. This will only happen if we were in the COOKIE-WAIT state * and a INIT collided with us and somewhere the peer sent the * cookie on another address besides the single address our assoc * had for him. In this case we will have one of the tie-tags set at * least AND the address field in the cookie can be used to look it * up. */ to = NULL; switch (cookie->addr_type) { #ifdef INET6 case SCTP_IPV6_ADDRESS: memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_port = sh->src_port; sin6.sin6_scope_id = cookie->scope_id; memcpy(&sin6.sin6_addr.s6_addr, cookie->address, sizeof(sin6.sin6_addr.s6_addr)); to = (struct sockaddr *)&sin6; break; #endif #ifdef INET case SCTP_IPV4_ADDRESS: memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = sh->src_port; sin.sin_addr.s_addr = cookie->address[0]; to = (struct sockaddr *)&sin; break; #endif default: /* This should not happen */ return (NULL); } if (*stcb == NULL) { /* Yep, lets check */ *stcb = sctp_findassociation_ep_addr(inp_p, to, netp, dst, NULL); if (*stcb == NULL) { /* * We should have only got back the same inp. If we * got back a different ep we have a problem. The * original findep got back l_inp and now */ if (l_inp != *inp_p) { SCTP_PRINTF("Bad problem find_ep got a diff inp then special_locate?\n"); } } else { if (*locked_tcb == NULL) { /* * In this case we found the assoc only * after we locked the create lock. This * means we are in a colliding case and we * must make sure that we unlock the tcb if * its one of the cases where we throw away * the incoming packets. */ *locked_tcb = *stcb; /* * We must also increment the inp ref count * since the ref_count flags was set when we * did not find the TCB, now we found it * which reduces the refcount.. we must * raise it back out to balance it all :-) */ SCTP_INP_INCR_REF((*stcb)->sctp_ep); if ((*stcb)->sctp_ep != l_inp) { SCTP_PRINTF("Huh? ep:%p diff then l_inp:%p?\n", (void *)(*stcb)->sctp_ep, (void *)l_inp); } } } } cookie_len -= SCTP_SIGNATURE_SIZE; if (*stcb == NULL) { /* this is the "normal" case... get a new TCB */ *stcb = sctp_process_cookie_new(m, iphlen, offset, src, dst, sh, cookie, cookie_len, *inp_p, netp, to, ¬ification, auth_skipped, auth_offset, auth_len, mflowtype, mflowid, vrf_id, port); } else { /* this is abnormal... cookie-echo on existing TCB */ had_a_existing_tcb = 1; *stcb = sctp_process_cookie_existing(m, iphlen, offset, src, dst, sh, cookie, cookie_len, *inp_p, *stcb, netp, to, ¬ification, auth_skipped, auth_offset, auth_len, mflowtype, mflowid, vrf_id, port); } if (*stcb == NULL) { /* still no TCB... must be bad cookie-echo */ return (NULL); } if (*netp != NULL) { (*netp)->flowtype = mflowtype; (*netp)->flowid = mflowid; } /* * Ok, we built an association so confirm the address we sent the * INIT-ACK to. */ netl = sctp_findnet(*stcb, to); /* * This code should in theory NOT run but */ if (netl == NULL) { /* TSNH! Huh, why do I need to add this address here? */ if (sctp_add_remote_addr(*stcb, to, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_IN_COOKIE_PROC)) { return (NULL); } netl = sctp_findnet(*stcb, to); } if (netl) { if (netl->dest_state & SCTP_ADDR_UNCONFIRMED) { netl->dest_state &= ~SCTP_ADDR_UNCONFIRMED; (void)sctp_set_primary_addr((*stcb), (struct sockaddr *)NULL, netl); send_int_conf = 1; } } sctp_start_net_timers(*stcb); if ((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { if (!had_a_existing_tcb || (((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) { /* * If we have a NEW cookie or the connect never * reached the connected state during collision we * must do the TCP accept thing. */ struct socket *so, *oso; struct sctp_inpcb *inp; if (notification == SCTP_NOTIFY_ASSOC_RESTART) { /* * For a restart we will keep the same * socket, no need to do anything. I THINK!! */ sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); if (send_int_conf) { sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); } return (m); } oso = (*inp_p)->sctp_socket; atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); CURVNET_SET(oso->so_vnet); so = sonewconn(oso, 0 ); CURVNET_RESTORE(); SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); if (so == NULL) { struct mbuf *op_err; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *pcb_so; #endif /* Too many sockets */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: no room for another socket!\n"); op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(*inp_p, NULL, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) pcb_so = SCTP_INP_SO(*inp_p); atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); SCTP_SOCKET_LOCK(pcb_so, 1); SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); #endif (void)sctp_free_assoc(*inp_p, *stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_23); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(pcb_so, 1); #endif return (NULL); } inp = (struct sctp_inpcb *)so->so_pcb; SCTP_INP_INCR_REF(inp); /* * We add the unbound flag here so that if we get an * soabort() before we get the move_pcb done, we * will properly cleanup. */ inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE | SCTP_PCB_FLAGS_CONNECTED | SCTP_PCB_FLAGS_IN_TCPPOOL | SCTP_PCB_FLAGS_UNBOUND | (SCTP_PCB_COPY_FLAGS & (*inp_p)->sctp_flags) | SCTP_PCB_FLAGS_DONT_WAKE); inp->sctp_features = (*inp_p)->sctp_features; inp->sctp_mobility_features = (*inp_p)->sctp_mobility_features; inp->sctp_socket = so; inp->sctp_frag_point = (*inp_p)->sctp_frag_point; inp->max_cwnd = (*inp_p)->max_cwnd; inp->sctp_cmt_on_off = (*inp_p)->sctp_cmt_on_off; inp->ecn_supported = (*inp_p)->ecn_supported; inp->prsctp_supported = (*inp_p)->prsctp_supported; inp->auth_supported = (*inp_p)->auth_supported; inp->asconf_supported = (*inp_p)->asconf_supported; inp->reconfig_supported = (*inp_p)->reconfig_supported; inp->nrsack_supported = (*inp_p)->nrsack_supported; inp->pktdrop_supported = (*inp_p)->pktdrop_supported; inp->partial_delivery_point = (*inp_p)->partial_delivery_point; inp->sctp_context = (*inp_p)->sctp_context; inp->local_strreset_support = (*inp_p)->local_strreset_support; inp->fibnum = (*inp_p)->fibnum; inp->inp_starting_point_for_iterator = NULL; /* * copy in the authentication parameters from the * original endpoint */ if (inp->sctp_ep.local_hmacs) sctp_free_hmaclist(inp->sctp_ep.local_hmacs); inp->sctp_ep.local_hmacs = sctp_copy_hmaclist((*inp_p)->sctp_ep.local_hmacs); if (inp->sctp_ep.local_auth_chunks) sctp_free_chunklist(inp->sctp_ep.local_auth_chunks); inp->sctp_ep.local_auth_chunks = sctp_copy_chunklist((*inp_p)->sctp_ep.local_auth_chunks); /* * Now we must move it from one hash table to * another and get the tcb in the right place. */ /* * This is where the one-2-one socket is put into * the accept state waiting for the accept! */ if (*stcb) { (*stcb)->asoc.state |= SCTP_STATE_IN_ACCEPT_QUEUE; } sctp_move_pcb_and_assoc(*inp_p, inp, *stcb); atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); sctp_pull_off_control_to_new_inp((*inp_p), inp, *stcb, 0); SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); /* * now we must check to see if we were aborted while * the move was going on and the lock/unlock * happened. */ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* * yep it was, we leave the assoc attached * to the socket since the sctp_inpcb_free() * call will send an abort for us. */ SCTP_INP_DECR_REF(inp); return (NULL); } SCTP_INP_DECR_REF(inp); /* Switch over to the new guy */ *inp_p = inp; sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); if (send_int_conf) { sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); } /* * Pull it from the incomplete queue and wake the * guy */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); SCTP_SOCKET_LOCK(so, 1); #endif soisconnected(so); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); SCTP_SOCKET_UNLOCK(so, 1); #endif return (m); } } if (notification) { sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } if (send_int_conf) { sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); } return (m); } static void sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, struct sctp_tcb *stcb, struct sctp_nets *net) { /* cp must not be used, others call this without a c-ack :-) */ struct sctp_association *asoc; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_cookie_ack: handling COOKIE-ACK\n"); if ((stcb == NULL) || (net == NULL)) { return; } asoc = &stcb->asoc; sctp_stop_all_cookie_timers(stcb); /* process according to association state */ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) { /* state change only needed when I am in right state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n"); SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); sctp_start_net_timers(stcb); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } /* update RTO */ SCTP_STAT_INCR_COUNTER32(sctps_activeestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); if (asoc->overall_error_count == 0) { net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, sctp_align_safe_nocopy, SCTP_RTT_FROM_NON_DATA); } (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); sctp_ulp_notify(SCTP_NOTIFY_ASSOC_UP, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif if ((stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) == 0) { soisconnected(stcb->sctp_socket); } #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } /* * since we did not send a HB make sure we don't double * things */ net->hb_responded = 1; if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { /* * We don't need to do the asconf thing, nor hb or * autoclose if the socket is closed. */ goto closed_socket; } sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); if (stcb->asoc.sctp_autoclose_ticks && sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_AUTOCLOSE)) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, stcb->sctp_ep, stcb, NULL); } /* * send ASCONF if parameters are pending and ASCONFs are * allowed (eg. addresses changed when init/cookie echo were * in flight) */ if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) && (stcb->asoc.asconf_supported == 1) && (!TAILQ_EMPTY(&stcb->asoc.asconf_queue))) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, stcb->asoc.primary_destination); #else sctp_send_asconf(stcb, stcb->asoc.primary_destination, SCTP_ADDR_NOT_LOCKED); #endif } } closed_socket: /* Toss the cookie if I can */ sctp_toss_old_cookies(stcb, asoc); if (!TAILQ_EMPTY(&asoc->sent_queue)) { /* Restart the timer if we have pending data */ struct sctp_tmit_chunk *chk; chk = TAILQ_FIRST(&asoc->sent_queue); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo); } } static void sctp_handle_ecn_echo(struct sctp_ecne_chunk *cp, struct sctp_tcb *stcb) { struct sctp_nets *net; struct sctp_tmit_chunk *lchk; struct sctp_ecne_chunk bkup; uint8_t override_bit; uint32_t tsn, window_data_tsn; int len; unsigned int pkt_cnt; len = ntohs(cp->ch.chunk_length); if ((len != sizeof(struct sctp_ecne_chunk)) && (len != sizeof(struct old_sctp_ecne_chunk))) { return; } if (len == sizeof(struct old_sctp_ecne_chunk)) { /* Its the old format */ memcpy(&bkup, cp, sizeof(struct old_sctp_ecne_chunk)); bkup.num_pkts_since_cwr = htonl(1); cp = &bkup; } SCTP_STAT_INCR(sctps_recvecne); tsn = ntohl(cp->tsn); pkt_cnt = ntohl(cp->num_pkts_since_cwr); lchk = TAILQ_LAST(&stcb->asoc.send_queue, sctpchunk_listhead); if (lchk == NULL) { window_data_tsn = stcb->asoc.sending_seq - 1; } else { window_data_tsn = lchk->rec.data.tsn; } /* Find where it was sent to if possible. */ net = NULL; TAILQ_FOREACH(lchk, &stcb->asoc.sent_queue, sctp_next) { if (lchk->rec.data.tsn == tsn) { net = lchk->whoTo; net->ecn_prev_cwnd = lchk->rec.data.cwnd_at_send; break; } if (SCTP_TSN_GT(lchk->rec.data.tsn, tsn)) { break; } } if (net == NULL) { /* * What to do. A previous send of a CWR was possibly lost. * See how old it is, we may have it marked on the actual * net. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (tsn == net->last_cwr_tsn) { /* Found him, send it off */ break; } } if (net == NULL) { /* * If we reach here, we need to send a special CWR * that says hey, we did this a long time ago and * you lost the response. */ net = TAILQ_FIRST(&stcb->asoc.nets); if (net == NULL) { /* TSNH */ return; } override_bit = SCTP_CWR_REDUCE_OVERRIDE; } else { override_bit = 0; } } else { override_bit = 0; } if (SCTP_TSN_GT(tsn, net->cwr_window_tsn) && ((override_bit & SCTP_CWR_REDUCE_OVERRIDE) == 0)) { /* * JRS - Use the congestion control given in the pluggable * CC module */ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo(stcb, net, 0, pkt_cnt); /* * We reduce once every RTT. So we will only lower cwnd at * the next sending seq i.e. the window_data_tsn */ net->cwr_window_tsn = window_data_tsn; net->ecn_ce_pkt_cnt += pkt_cnt; net->lost_cnt = pkt_cnt; net->last_cwr_tsn = tsn; } else { override_bit |= SCTP_CWR_IN_SAME_WINDOW; if (SCTP_TSN_GT(tsn, net->last_cwr_tsn) && ((override_bit & SCTP_CWR_REDUCE_OVERRIDE) == 0)) { /* * Another loss in the same window update how many * marks/packets lost we have had. */ int cnt = 1; if (pkt_cnt > net->lost_cnt) { /* Should be the case */ cnt = (pkt_cnt - net->lost_cnt); net->ecn_ce_pkt_cnt += cnt; } net->lost_cnt = pkt_cnt; net->last_cwr_tsn = tsn; /* * Most CC functions will ignore this call, since we * are in-window yet of the initial CE the peer saw. */ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo(stcb, net, 1, cnt); } } /* * We always send a CWR this way if our previous one was lost our * peer will get an update, or if it is not time again to reduce we * still get the cwr to the peer. Note we set the override when we * could not find the TSN on the chunk or the destination network. */ sctp_send_cwr(stcb, net, net->last_cwr_tsn, override_bit); } static void sctp_handle_ecn_cwr(struct sctp_cwr_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net) { /* * Here we get a CWR from the peer. We must look in the outqueue and * make sure that we have a covered ECNE in the control chunk part. * If so remove it. */ struct sctp_tmit_chunk *chk; struct sctp_ecne_chunk *ecne; int override; uint32_t cwr_tsn; cwr_tsn = ntohl(cp->tsn); override = cp->ch.chunk_flags & SCTP_CWR_REDUCE_OVERRIDE; TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) { if (chk->rec.chunk_id.id != SCTP_ECN_ECHO) { continue; } if ((override == 0) && (chk->whoTo != net)) { /* Must be from the right src unless override is set */ continue; } ecne = mtod(chk->data, struct sctp_ecne_chunk *); if (SCTP_TSN_GE(cwr_tsn, ntohl(ecne->tsn))) { /* this covers this ECNE, we can remove it */ stcb->asoc.ecn_echo_cnt_onq--; TAILQ_REMOVE(&stcb->asoc.control_send_queue, chk, sctp_next); sctp_m_freem(chk->data); chk->data = NULL; stcb->asoc.ctrl_queue_cnt--; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); if (override == 0) { break; } } } } static void sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSED, struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_association *asoc; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_complete: handling SHUTDOWN-COMPLETE\n"); if (stcb == NULL) return; asoc = &stcb->asoc; /* process according to association state */ if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) { /* unexpected SHUTDOWN-COMPLETE... so ignore... */ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_complete: not in SCTP_STATE_SHUTDOWN_ACK_SENT --- ignore\n"); SCTP_TCB_UNLOCK(stcb); return; } /* notify upper layer protocol */ if (stcb->sctp_socket) { sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } #ifdef INVARIANTS if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) { panic("Queues are not empty when handling SHUTDOWN-COMPLETE"); } #endif /* stop the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_24); SCTP_STAT_INCR_COUNTER32(sctps_shutdown); /* free the TCB */ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_complete: calls free-asoc\n"); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_25); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif return; } static int process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc, struct sctp_nets *net, uint8_t flg) { switch (desc->chunk_type) { case SCTP_DATA: /* find the tsn to resend (possibly */ { uint32_t tsn; struct sctp_tmit_chunk *tp1; tsn = ntohl(desc->tsn_ifany); TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) { if (tp1->rec.data.tsn == tsn) { /* found it */ break; } if (SCTP_TSN_GT(tp1->rec.data.tsn, tsn)) { /* not found */ tp1 = NULL; break; } } if (tp1 == NULL) { /* * Do it the other way , aka without paying * attention to queue seq order. */ SCTP_STAT_INCR(sctps_pdrpdnfnd); TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) { if (tp1->rec.data.tsn == tsn) { /* found it */ break; } } } if (tp1 == NULL) { SCTP_STAT_INCR(sctps_pdrptsnnf); } if ((tp1) && (tp1->sent < SCTP_DATAGRAM_ACKED)) { uint8_t *ddp; if (((flg & SCTP_BADCRC) == 0) && ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) { return (0); } if ((stcb->asoc.peers_rwnd == 0) && ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) { SCTP_STAT_INCR(sctps_pdrpdiwnp); return (0); } if (stcb->asoc.peers_rwnd == 0 && (flg & SCTP_FROM_MIDDLE_BOX)) { SCTP_STAT_INCR(sctps_pdrpdizrw); return (0); } ddp = (uint8_t *)(mtod(tp1->data, caddr_t)+ sizeof(struct sctp_data_chunk)); { unsigned int iii; for (iii = 0; iii < sizeof(desc->data_bytes); iii++) { if (ddp[iii] != desc->data_bytes[iii]) { SCTP_STAT_INCR(sctps_pdrpbadd); return (-1); } } } if (tp1->do_rtt) { /* * this guy had a RTO calculation * pending on it, cancel it */ if (tp1->whoTo->rto_needed == 0) { tp1->whoTo->rto_needed = 1; } tp1->do_rtt = 0; } SCTP_STAT_INCR(sctps_pdrpmark); if (tp1->sent != SCTP_DATAGRAM_RESEND) sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); /* * mark it as if we were doing a FR, since * we will be getting gap ack reports behind * the info from the router. */ tp1->rec.data.doing_fast_retransmit = 1; /* * mark the tsn with what sequences can * cause a new FR. */ if (TAILQ_EMPTY(&stcb->asoc.send_queue)) { tp1->rec.data.fast_retran_tsn = stcb->asoc.sending_seq; } else { tp1->rec.data.fast_retran_tsn = (TAILQ_FIRST(&stcb->asoc.send_queue))->rec.data.tsn; } /* restart the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, tp1->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_26); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, tp1->whoTo); /* fix counts and things */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PDRP, tp1->whoTo->flight_size, tp1->book_size, (uint32_t)(uintptr_t)stcb, tp1->rec.data.tsn); } if (tp1->sent < SCTP_DATAGRAM_RESEND) { sctp_flight_size_decrease(tp1); sctp_total_flight_decrease(stcb, tp1); } tp1->sent = SCTP_DATAGRAM_RESEND; } { /* audit code */ unsigned int audit; audit = 0; TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) { if (tp1->sent == SCTP_DATAGRAM_RESEND) audit++; } TAILQ_FOREACH(tp1, &stcb->asoc.control_send_queue, sctp_next) { if (tp1->sent == SCTP_DATAGRAM_RESEND) audit++; } if (audit != stcb->asoc.sent_queue_retran_cnt) { SCTP_PRINTF("**Local Audit finds cnt:%d asoc cnt:%d\n", audit, stcb->asoc.sent_queue_retran_cnt); #ifndef SCTP_AUDITING_ENABLED stcb->asoc.sent_queue_retran_cnt = audit; #endif } } } break; case SCTP_ASCONF: { struct sctp_tmit_chunk *asconf; TAILQ_FOREACH(asconf, &stcb->asoc.control_send_queue, sctp_next) { if (asconf->rec.chunk_id.id == SCTP_ASCONF) { break; } } if (asconf) { if (asconf->sent != SCTP_DATAGRAM_RESEND) sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); asconf->sent = SCTP_DATAGRAM_RESEND; asconf->snd_count--; } } break; case SCTP_INITIATION: /* resend the INIT */ stcb->asoc.dropped_special_cnt++; if (stcb->asoc.dropped_special_cnt < SCTP_RETRY_DROPPED_THRESH) { /* * If we can get it in, in a few attempts we do * this, otherwise we let the timer fire. */ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); } break; case SCTP_SELECTIVE_ACK: case SCTP_NR_SELECTIVE_ACK: /* resend the sack */ sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED); break; case SCTP_HEARTBEAT_REQUEST: /* resend a demand HB */ if ((stcb->asoc.overall_error_count + 3) < stcb->asoc.max_send_times) { /* * Only retransmit if we KNOW we wont destroy the * tcb */ sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED); } break; case SCTP_SHUTDOWN: sctp_send_shutdown(stcb, net); break; case SCTP_SHUTDOWN_ACK: sctp_send_shutdown_ack(stcb, net); break; case SCTP_COOKIE_ECHO: { struct sctp_tmit_chunk *cookie; cookie = NULL; TAILQ_FOREACH(cookie, &stcb->asoc.control_send_queue, sctp_next) { if (cookie->rec.chunk_id.id == SCTP_COOKIE_ECHO) { break; } } if (cookie) { if (cookie->sent != SCTP_DATAGRAM_RESEND) sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); cookie->sent = SCTP_DATAGRAM_RESEND; sctp_stop_all_cookie_timers(stcb); } } break; case SCTP_COOKIE_ACK: sctp_send_cookie_ack(stcb); break; case SCTP_ASCONF_ACK: /* resend last asconf ack */ sctp_send_asconf_ack(stcb); break; case SCTP_IFORWARD_CUM_TSN: case SCTP_FORWARD_CUM_TSN: send_forward_tsn(stcb, &stcb->asoc); break; /* can't do anything with these */ case SCTP_PACKET_DROPPED: case SCTP_INITIATION_ACK: /* this should not happen */ case SCTP_HEARTBEAT_ACK: case SCTP_ABORT_ASSOCIATION: case SCTP_OPERATION_ERROR: case SCTP_SHUTDOWN_COMPLETE: case SCTP_ECN_ECHO: case SCTP_ECN_CWR: default: break; } return (0); } void sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *list) { uint32_t i; uint16_t temp; /* * We set things to 0xffffffff since this is the last delivered * sequence and we will be sending in 0 after the reset. */ if (number_entries) { for (i = 0; i < number_entries; i++) { temp = ntohs(list[i]); if (temp >= stcb->asoc.streamincnt) { continue; } stcb->asoc.strmin[temp].last_mid_delivered = 0xffffffff; } } else { list = NULL; for (i = 0; i < stcb->asoc.streamincnt; i++) { stcb->asoc.strmin[i].last_mid_delivered = 0xffffffff; } } sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_RECV, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); } static void sctp_reset_out_streams(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *list) { uint32_t i; uint16_t temp; if (number_entries > 0) { for (i = 0; i < number_entries; i++) { temp = ntohs(list[i]); if (temp >= stcb->asoc.streamoutcnt) { /* no such stream */ continue; } stcb->asoc.strmout[temp].next_mid_ordered = 0; stcb->asoc.strmout[temp].next_mid_unordered = 0; } } else { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].next_mid_ordered = 0; stcb->asoc.strmout[i].next_mid_unordered = 0; } } sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_SEND, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); } static void sctp_reset_clear_pending(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *list) { uint32_t i; uint16_t temp; if (number_entries > 0) { for (i = 0; i < number_entries; i++) { temp = ntohs(list[i]); if (temp >= stcb->asoc.streamoutcnt) { /* no such stream */ continue; } stcb->asoc.strmout[temp].state = SCTP_STREAM_OPEN; } } else { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].state = SCTP_STREAM_OPEN; } } } struct sctp_stream_reset_request * sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chunk **bchk) { struct sctp_association *asoc; struct sctp_chunkhdr *ch; struct sctp_stream_reset_request *r; struct sctp_tmit_chunk *chk; int len, clen; asoc = &stcb->asoc; if (TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { asoc->stream_reset_outstanding = 0; return (NULL); } if (stcb->asoc.str_reset == NULL) { asoc->stream_reset_outstanding = 0; return (NULL); } chk = stcb->asoc.str_reset; if (chk->data == NULL) { return (NULL); } if (bchk) { /* he wants a copy of the chk pointer */ *bchk = chk; } clen = chk->send_size; ch = mtod(chk->data, struct sctp_chunkhdr *); r = (struct sctp_stream_reset_request *)(ch + 1); if (ntohl(r->request_seq) == seq) { /* found it */ return (r); } len = SCTP_SIZE32(ntohs(r->ph.param_length)); if (clen > (len + (int)sizeof(struct sctp_chunkhdr))) { /* move to the next one, there can only be a max of two */ r = (struct sctp_stream_reset_request *)((caddr_t)r + len); if (ntohl(r->request_seq) == seq) { return (r); } } /* that seq is not here */ return (NULL); } static void sctp_clean_up_stream_reset(struct sctp_tcb *stcb) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk = stcb->asoc.str_reset; if (stcb->asoc.str_reset == NULL) { return; } asoc = &stcb->asoc; sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_28); TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } asoc->ctrl_queue_cnt--; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); /* sa_ignore NO_NULL_CHK */ stcb->asoc.str_reset = NULL; } static int sctp_handle_stream_reset_response(struct sctp_tcb *stcb, uint32_t seq, uint32_t action, struct sctp_stream_reset_response *respin) { uint16_t type; int lparm_len; struct sctp_association *asoc = &stcb->asoc; struct sctp_tmit_chunk *chk; struct sctp_stream_reset_request *req_param; struct sctp_stream_reset_out_request *req_out_param; struct sctp_stream_reset_in_request *req_in_param; uint32_t number_entries; if (asoc->stream_reset_outstanding == 0) { /* duplicate */ return (0); } if (seq == stcb->asoc.str_reset_seq_out) { req_param = sctp_find_stream_reset(stcb, seq, &chk); if (req_param != NULL) { stcb->asoc.str_reset_seq_out++; type = ntohs(req_param->ph.param_type); lparm_len = ntohs(req_param->ph.param_length); if (type == SCTP_STR_RESET_OUT_REQUEST) { int no_clear = 0; req_out_param = (struct sctp_stream_reset_out_request *)req_param; number_entries = (lparm_len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t); asoc->stream_reset_out_is_outstanding = 0; if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { /* do it */ sctp_reset_out_streams(stcb, number_entries, req_out_param->list_of_streams); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED); } else if (action == SCTP_STREAM_RESET_RESULT_IN_PROGRESS) { /* * Set it up so we don't stop * retransmitting */ asoc->stream_reset_outstanding++; stcb->asoc.str_reset_seq_out--; asoc->stream_reset_out_is_outstanding = 1; no_clear = 1; } else { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED); } if (no_clear == 0) { sctp_reset_clear_pending(stcb, number_entries, req_out_param->list_of_streams); } } else if (type == SCTP_STR_RESET_IN_REQUEST) { req_in_param = (struct sctp_stream_reset_in_request *)req_param; number_entries = (lparm_len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t); if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_IN, stcb, number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED); } else if (action != SCTP_STREAM_RESET_RESULT_PERFORMED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_IN, stcb, number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED); } } else if (type == SCTP_STR_RESET_ADD_OUT_STREAMS) { /* Ok we now may have more streams */ int num_stream; num_stream = stcb->asoc.strm_pending_add_size; if (num_stream > (stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt)) { /* TSNH */ num_stream = stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt; } stcb->asoc.strm_pending_add_size = 0; if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { /* Put the new streams into effect */ int i; for (i = asoc->streamoutcnt; i < (asoc->streamoutcnt + num_stream); i++) { asoc->strmout[i].state = SCTP_STREAM_OPEN; } asoc->streamoutcnt += num_stream; sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, 0); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_DENIED); } else { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_FAILED); } } else if (type == SCTP_STR_RESET_ADD_IN_STREAMS) { if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_DENIED); } else if (action != SCTP_STREAM_RESET_RESULT_PERFORMED) { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_FAILED); } } else if (type == SCTP_STR_RESET_TSN_REQUEST) { /** * a) Adopt the new in tsn. * b) reset the map * c) Adopt the new out-tsn */ struct sctp_stream_reset_response_tsn *resp; struct sctp_forward_tsn_chunk fwdtsn; int abort_flag = 0; if (respin == NULL) { /* huh ? */ return (0); } if (ntohs(respin->ph.param_length) < sizeof(struct sctp_stream_reset_response_tsn)) { return (0); } if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { resp = (struct sctp_stream_reset_response_tsn *)respin; asoc->stream_reset_outstanding--; fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk)); fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN; fwdtsn.new_cumulative_tsn = htonl(ntohl(resp->senders_next_tsn) - 1); sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0); if (abort_flag) { return (1); } stcb->asoc.highest_tsn_inside_map = (ntohl(resp->senders_next_tsn) - 1); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 7, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } stcb->asoc.tsn_last_delivered = stcb->asoc.cumulative_tsn = stcb->asoc.highest_tsn_inside_map; stcb->asoc.mapping_array_base_tsn = ntohl(resp->senders_next_tsn); memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size); stcb->asoc.highest_tsn_inside_nr_map = stcb->asoc.highest_tsn_inside_map; memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size); stcb->asoc.sending_seq = ntohl(resp->receivers_next_tsn); stcb->asoc.last_acked_seq = stcb->asoc.cumulative_tsn; sctp_reset_out_streams(stcb, 0, (uint16_t *)NULL); sctp_reset_in_stream(stcb, 0, (uint16_t *)NULL); sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1), 0); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1), SCTP_ASSOC_RESET_DENIED); } else { sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1), SCTP_ASSOC_RESET_FAILED); } } /* get rid of the request and get the request flags */ if (asoc->stream_reset_outstanding == 0) { sctp_clean_up_stream_reset(stcb); } } } if (asoc->stream_reset_outstanding == 0) { sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED); } return (0); } static void sctp_handle_str_reset_request_in(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_in_request *req, int trunc) { uint32_t seq; int len, i; int number_entries; uint16_t temp; /* * peer wants me to send a str-reset to him for my outgoing seq's if * seq_in is right. */ struct sctp_association *asoc = &stcb->asoc; seq = ntohl(req->request_seq); if (asoc->str_reset_seq_in == seq) { asoc->last_reset_action[1] = asoc->last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_RESET_STREAM_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (trunc) { /* Can't do it, since they exceeded our buffer size */ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (stcb->asoc.stream_reset_out_is_outstanding == 0) { len = ntohs(req->ph.param_length); number_entries = ((len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t)); if (number_entries) { for (i = 0; i < number_entries; i++) { temp = ntohs(req->list_of_streams[i]); if (temp >= stcb->asoc.streamoutcnt) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; goto bad_boy; } req->list_of_streams[i] = temp; } for (i = 0; i < number_entries; i++) { if (stcb->asoc.strmout[req->list_of_streams[i]].state == SCTP_STREAM_OPEN) { stcb->asoc.strmout[req->list_of_streams[i]].state = SCTP_STREAM_RESET_PENDING; } } } else { /* Its all */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN) stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING; } } asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; } else { /* Can't do it, since we have sent one out */ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS; } bad_boy: sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; } else if (asoc->str_reset_seq_in - 1 == seq) { sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if (asoc->str_reset_seq_in - 2 == seq) { sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED); } static int sctp_handle_str_reset_request_tsn(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_tsn_request *req) { /* reset all in and out and update the tsn */ /* * A) reset my str-seq's on in and out. B) Select a receive next, * and set cum-ack to it. Also process this selected number as a * fwd-tsn as well. C) set in the response my next sending seq. */ struct sctp_forward_tsn_chunk fwdtsn; struct sctp_association *asoc = &stcb->asoc; int abort_flag = 0; uint32_t seq; seq = ntohl(req->request_seq); if (asoc->str_reset_seq_in == seq) { asoc->last_reset_action[1] = stcb->asoc.last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else { fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk)); fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN; fwdtsn.ch.chunk_flags = 0; fwdtsn.new_cumulative_tsn = htonl(stcb->asoc.highest_tsn_inside_map + 1); sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0); if (abort_flag) { return (1); } asoc->highest_tsn_inside_map += SCTP_STREAM_RESET_TSN_DELTA; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 10, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } asoc->tsn_last_delivered = asoc->cumulative_tsn = asoc->highest_tsn_inside_map; asoc->mapping_array_base_tsn = asoc->highest_tsn_inside_map + 1; memset(asoc->mapping_array, 0, asoc->mapping_array_size); asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map; memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size); atomic_add_int(&asoc->sending_seq, 1); /* save off historical data for retrans */ asoc->last_sending_seq[1] = asoc->last_sending_seq[0]; asoc->last_sending_seq[0] = asoc->sending_seq; asoc->last_base_tsnsent[1] = asoc->last_base_tsnsent[0]; asoc->last_base_tsnsent[0] = asoc->mapping_array_base_tsn; sctp_reset_out_streams(stcb, 0, (uint16_t *)NULL); sctp_reset_in_stream(stcb, 0, (uint16_t *)NULL); asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; sctp_notify_stream_reset_tsn(stcb, asoc->sending_seq, (asoc->mapping_array_base_tsn + 1), 0); } sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[0], asoc->last_sending_seq[0], asoc->last_base_tsnsent[0]); asoc->str_reset_seq_in++; } else if (asoc->str_reset_seq_in - 1 == seq) { sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[0], asoc->last_sending_seq[0], asoc->last_base_tsnsent[0]); } else if (asoc->str_reset_seq_in - 2 == seq) { sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[1], asoc->last_sending_seq[1], asoc->last_base_tsnsent[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } return (0); } static void sctp_handle_str_reset_request_out(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_out_request *req, int trunc) { uint32_t seq, tsn; int number_entries, len; struct sctp_association *asoc = &stcb->asoc; seq = ntohl(req->request_seq); /* now if its not a duplicate we process it */ if (asoc->str_reset_seq_in == seq) { len = ntohs(req->ph.param_length); number_entries = ((len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t)); /* * the sender is resetting, handle the list issue.. we must * a) verify if we can do the reset, if so no problem b) If * we can't do the reset we must copy the request. c) queue * it, and setup the data in processor to trigger it off * when needed and dequeue all the queued data. */ tsn = ntohl(req->send_reset_at_tsn); /* move the reset action back one */ asoc->last_reset_action[1] = asoc->last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_RESET_STREAM_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (trunc) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (SCTP_TSN_GE(asoc->cumulative_tsn, tsn)) { /* we can do it now */ sctp_reset_in_stream(stcb, number_entries, req->list_of_streams); asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; } else { /* * we must queue it up and thus wait for the TSN's * to arrive that are at or before tsn */ struct sctp_stream_reset_list *liste; int siz; siz = sizeof(struct sctp_stream_reset_list) + (number_entries * sizeof(uint16_t)); SCTP_MALLOC(liste, struct sctp_stream_reset_list *, siz, SCTP_M_STRESET); if (liste == NULL) { /* gak out of memory */ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); return; } liste->seq = seq; liste->tsn = tsn; liste->number_entries = number_entries; memcpy(&liste->list_of_streams, req->list_of_streams, number_entries * sizeof(uint16_t)); TAILQ_INSERT_TAIL(&asoc->resetHead, liste, next_resp); asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_IN_PROGRESS; } sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; } else if ((asoc->str_reset_seq_in - 1) == seq) { /* * one seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if ((asoc->str_reset_seq_in - 2) == seq) { /* * two seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } } static void sctp_handle_str_reset_add_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_add_strm *str_add) { /* * Peer is requesting to add more streams. If its within our * max-streams we will allow it. */ uint32_t num_stream, i; uint32_t seq; struct sctp_association *asoc = &stcb->asoc; struct sctp_queued_to_read *ctl, *nctl; /* Get the number. */ seq = ntohl(str_add->request_seq); num_stream = ntohs(str_add->number_of_streams); /* Now what would be the new total? */ if (asoc->str_reset_seq_in == seq) { num_stream += stcb->asoc.streamincnt; stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if ((num_stream > stcb->asoc.max_inbound_streams) || (num_stream > 0xffff)) { /* We must reject it they ask for to many */ denied: stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else { /* Ok, we can do that :-) */ struct sctp_stream_in *oldstrm; /* save off the old */ oldstrm = stcb->asoc.strmin; SCTP_MALLOC(stcb->asoc.strmin, struct sctp_stream_in *, (num_stream * sizeof(struct sctp_stream_in)), SCTP_M_STRMI); if (stcb->asoc.strmin == NULL) { stcb->asoc.strmin = oldstrm; goto denied; } /* copy off the old data */ for (i = 0; i < stcb->asoc.streamincnt; i++) { TAILQ_INIT(&stcb->asoc.strmin[i].inqueue); TAILQ_INIT(&stcb->asoc.strmin[i].uno_inqueue); stcb->asoc.strmin[i].sid = i; stcb->asoc.strmin[i].last_mid_delivered = oldstrm[i].last_mid_delivered; stcb->asoc.strmin[i].delivery_started = oldstrm[i].delivery_started; stcb->asoc.strmin[i].pd_api_started = oldstrm[i].pd_api_started; /* now anything on those queues? */ TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].inqueue, next_instrm, nctl) { TAILQ_REMOVE(&oldstrm[i].inqueue, ctl, next_instrm); TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].inqueue, ctl, next_instrm); } TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].uno_inqueue, next_instrm, nctl) { TAILQ_REMOVE(&oldstrm[i].uno_inqueue, ctl, next_instrm); TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].uno_inqueue, ctl, next_instrm); } } /* Init the new streams */ for (i = stcb->asoc.streamincnt; i < num_stream; i++) { TAILQ_INIT(&stcb->asoc.strmin[i].inqueue); TAILQ_INIT(&stcb->asoc.strmin[i].uno_inqueue); stcb->asoc.strmin[i].sid = i; stcb->asoc.strmin[i].last_mid_delivered = 0xffffffff; stcb->asoc.strmin[i].pd_api_started = 0; stcb->asoc.strmin[i].delivery_started = 0; } SCTP_FREE(oldstrm, SCTP_M_STRMI); /* update the size */ stcb->asoc.streamincnt = num_stream; stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, 0); } sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; } else if ((asoc->str_reset_seq_in - 1) == seq) { /* * one seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if ((asoc->str_reset_seq_in - 2) == seq) { /* * two seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } } static void sctp_handle_str_reset_add_out_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_add_strm *str_add) { /* * Peer is requesting to add more streams. If its within our * max-streams we will allow it. */ uint16_t num_stream; uint32_t seq; struct sctp_association *asoc = &stcb->asoc; /* Get the number. */ seq = ntohl(str_add->request_seq); num_stream = ntohs(str_add->number_of_streams); /* Now what would be the new total? */ if (asoc->str_reset_seq_in == seq) { stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (stcb->asoc.stream_reset_outstanding) { /* We must reject it we have something pending */ stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS; } else { /* Ok, we can do that :-) */ int mychk; mychk = stcb->asoc.streamoutcnt; mychk += num_stream; if (mychk < 0x10000) { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; if (sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 1, num_stream, 0, 1)) { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } } else { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } } sctp_add_stream_reset_result(chk, seq, stcb->asoc.last_reset_action[0]); asoc->str_reset_seq_in++; } else if ((asoc->str_reset_seq_in - 1) == seq) { /* * one seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if ((asoc->str_reset_seq_in - 2) == seq) { /* * two seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } } #ifdef __GNUC__ __attribute__((noinline)) #endif static int sctp_handle_stream_reset(struct sctp_tcb *stcb, struct mbuf *m, int offset, struct sctp_chunkhdr *ch_req) { uint16_t remaining_length, param_len, ptype; struct sctp_paramhdr pstore; uint8_t cstore[SCTP_CHUNK_BUFFER_SIZE]; uint32_t seq = 0; int num_req = 0; int trunc = 0; struct sctp_tmit_chunk *chk; struct sctp_chunkhdr *ch; struct sctp_paramhdr *ph; int ret_code = 0; int num_param = 0; /* now it may be a reset or a reset-response */ remaining_length = ntohs(ch_req->chunk_length) - sizeof(struct sctp_chunkhdr); /* setup for adding the response */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return (ret_code); } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->no_fr_allowed = 0; chk->book_size = chk->send_size = sizeof(struct sctp_chunkhdr); chk->book_size_scale = 0; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { strres_nochunk: if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return (ret_code); } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); /* setup chunk parameters */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->whoTo = NULL; ch = mtod(chk->data, struct sctp_chunkhdr *); ch->chunk_type = SCTP_STREAM_RESET; ch->chunk_flags = 0; ch->chunk_length = htons(chk->send_size); SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); offset += sizeof(struct sctp_chunkhdr); while (remaining_length >= sizeof(struct sctp_paramhdr)) { ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(pstore), (uint8_t *)&pstore); if (ph == NULL) { /* TSNH */ break; } param_len = ntohs(ph->param_length); if ((param_len > remaining_length) || (param_len < (sizeof(struct sctp_paramhdr) + sizeof(uint32_t)))) { /* bad parameter length */ break; } ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, min(param_len, sizeof(cstore)), (uint8_t *)&cstore); if (ph == NULL) { /* TSNH */ break; } ptype = ntohs(ph->param_type); num_param++; if (param_len > sizeof(cstore)) { trunc = 1; } else { trunc = 0; } if (num_param > SCTP_MAX_RESET_PARAMS) { /* hit the max of parameters already sorry.. */ break; } if (ptype == SCTP_STR_RESET_OUT_REQUEST) { struct sctp_stream_reset_out_request *req_out; if (param_len < sizeof(struct sctp_stream_reset_out_request)) { break; } req_out = (struct sctp_stream_reset_out_request *)ph; num_req++; if (stcb->asoc.stream_reset_outstanding) { seq = ntohl(req_out->response_seq); if (seq == stcb->asoc.str_reset_seq_out) { /* implicit ack */ (void)sctp_handle_stream_reset_response(stcb, seq, SCTP_STREAM_RESET_RESULT_PERFORMED, NULL); } } sctp_handle_str_reset_request_out(stcb, chk, req_out, trunc); } else if (ptype == SCTP_STR_RESET_ADD_OUT_STREAMS) { struct sctp_stream_reset_add_strm *str_add; if (param_len < sizeof(struct sctp_stream_reset_add_strm)) { break; } str_add = (struct sctp_stream_reset_add_strm *)ph; num_req++; sctp_handle_str_reset_add_strm(stcb, chk, str_add); } else if (ptype == SCTP_STR_RESET_ADD_IN_STREAMS) { struct sctp_stream_reset_add_strm *str_add; if (param_len < sizeof(struct sctp_stream_reset_add_strm)) { break; } str_add = (struct sctp_stream_reset_add_strm *)ph; num_req++; sctp_handle_str_reset_add_out_strm(stcb, chk, str_add); } else if (ptype == SCTP_STR_RESET_IN_REQUEST) { struct sctp_stream_reset_in_request *req_in; num_req++; req_in = (struct sctp_stream_reset_in_request *)ph; sctp_handle_str_reset_request_in(stcb, chk, req_in, trunc); } else if (ptype == SCTP_STR_RESET_TSN_REQUEST) { struct sctp_stream_reset_tsn_request *req_tsn; num_req++; req_tsn = (struct sctp_stream_reset_tsn_request *)ph; if (sctp_handle_str_reset_request_tsn(stcb, chk, req_tsn)) { ret_code = 1; goto strres_nochunk; } /* no more */ break; } else if (ptype == SCTP_STR_RESET_RESPONSE) { struct sctp_stream_reset_response *resp; uint32_t result; if (param_len < sizeof(struct sctp_stream_reset_response)) { break; } resp = (struct sctp_stream_reset_response *)ph; seq = ntohl(resp->response_seq); result = ntohl(resp->result); if (sctp_handle_stream_reset_response(stcb, seq, result, resp)) { ret_code = 1; goto strres_nochunk; } } else { break; } offset += SCTP_SIZE32(param_len); if (remaining_length >= SCTP_SIZE32(param_len)) { remaining_length -= SCTP_SIZE32(param_len); } else { remaining_length = 0; } } if (num_req == 0) { /* we have no response free the stuff */ goto strres_nochunk; } /* ok we have a chunk to link in */ TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); stcb->asoc.ctrl_queue_cnt++; return (ret_code); } /* * Handle a router or endpoints report of a packet loss, there are two ways * to handle this, either we get the whole packet and must disect it * ourselves (possibly with truncation and or corruption) or it is a summary * from a middle box that did the disectting for us. */ static void sctp_handle_packet_dropped(struct sctp_pktdrop_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t limit) { uint32_t bottle_bw, on_queue; uint16_t trunc_len; unsigned int chlen; unsigned int at; struct sctp_chunk_desc desc; struct sctp_chunkhdr *ch; chlen = ntohs(cp->ch.chunk_length); chlen -= sizeof(struct sctp_pktdrop_chunk); /* XXX possible chlen underflow */ if (chlen == 0) { ch = NULL; if (cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) SCTP_STAT_INCR(sctps_pdrpbwrpt); } else { ch = (struct sctp_chunkhdr *)(cp->data + sizeof(struct sctphdr)); chlen -= sizeof(struct sctphdr); /* XXX possible chlen underflow */ memset(&desc, 0, sizeof(desc)); } trunc_len = (uint16_t)ntohs(cp->trunc_len); if (trunc_len > limit) { trunc_len = limit; } /* now the chunks themselves */ while ((ch != NULL) && (chlen >= sizeof(struct sctp_chunkhdr))) { desc.chunk_type = ch->chunk_type; /* get amount we need to move */ at = ntohs(ch->chunk_length); if (at < sizeof(struct sctp_chunkhdr)) { /* corrupt chunk, maybe at the end? */ SCTP_STAT_INCR(sctps_pdrpcrupt); break; } if (trunc_len == 0) { /* we are supposed to have all of it */ if (at > chlen) { /* corrupt skip it */ SCTP_STAT_INCR(sctps_pdrpcrupt); break; } } else { /* is there enough of it left ? */ if (desc.chunk_type == SCTP_DATA) { if (chlen < (sizeof(struct sctp_data_chunk) + sizeof(desc.data_bytes))) { break; } } else { if (chlen < sizeof(struct sctp_chunkhdr)) { break; } } } if (desc.chunk_type == SCTP_DATA) { /* can we get out the tsn? */ if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX)) SCTP_STAT_INCR(sctps_pdrpmbda); if (chlen >= (sizeof(struct sctp_data_chunk) + sizeof(uint32_t))) { /* yep */ struct sctp_data_chunk *dcp; uint8_t *ddp; unsigned int iii; dcp = (struct sctp_data_chunk *)ch; ddp = (uint8_t *)(dcp + 1); for (iii = 0; iii < sizeof(desc.data_bytes); iii++) { desc.data_bytes[iii] = ddp[iii]; } desc.tsn_ifany = dcp->dp.tsn; } else { /* nope we are done. */ SCTP_STAT_INCR(sctps_pdrpnedat); break; } } else { if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX)) SCTP_STAT_INCR(sctps_pdrpmbct); } if (process_chunk_drop(stcb, &desc, net, cp->ch.chunk_flags)) { SCTP_STAT_INCR(sctps_pdrppdbrk); break; } if (SCTP_SIZE32(at) > chlen) { break; } chlen -= SCTP_SIZE32(at); if (chlen < sizeof(struct sctp_chunkhdr)) { /* done, none left */ break; } ch = (struct sctp_chunkhdr *)((caddr_t)ch + SCTP_SIZE32(at)); } /* Now update any rwnd --- possibly */ if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) == 0) { /* From a peer, we get a rwnd report */ uint32_t a_rwnd; SCTP_STAT_INCR(sctps_pdrpfehos); bottle_bw = ntohl(cp->bottle_bw); on_queue = ntohl(cp->current_onq); if (bottle_bw && on_queue) { /* a rwnd report is in here */ if (bottle_bw > on_queue) a_rwnd = bottle_bw - on_queue; else a_rwnd = 0; if (a_rwnd == 0) stcb->asoc.peers_rwnd = 0; else { if (a_rwnd > stcb->asoc.total_flight) { stcb->asoc.peers_rwnd = a_rwnd - stcb->asoc.total_flight; } else { stcb->asoc.peers_rwnd = 0; } if (stcb->asoc.peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ stcb->asoc.peers_rwnd = 0; } } } } else { SCTP_STAT_INCR(sctps_pdrpfmbox); } /* now middle boxes in sat networks get a cwnd bump */ if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) && (stcb->asoc.sat_t3_loss_recovery == 0) && (stcb->asoc.sat_network)) { /* * This is debatable but for sat networks it makes sense * Note if a T3 timer has went off, we will prohibit any * changes to cwnd until we exit the t3 loss recovery. */ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped(stcb, net, cp, &bottle_bw, &on_queue); } } /* * handles all control chunks in a packet inputs: - m: mbuf chain, assumed to * still contain IP/SCTP header - stcb: is the tcb found for this packet - * offset: offset into the mbuf chain to first chunkhdr - length: is the * length of the complete packet outputs: - length: modified to remaining * length after control processing - netp: modified to new sctp_nets after * cookie-echo processing - return NULL to discard the packet (ie. no asoc, * bad packet,...) otherwise return the tcb for this packet */ #ifdef __GNUC__ __attribute__((noinline)) #endif static struct sctp_tcb * sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp, int *fwd_tsn_seen, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { struct sctp_association *asoc; struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; uint32_t vtag_in; int num_chunks = 0; /* number of control chunks processed */ uint32_t chk_length; int ret; int abort_no_unlock = 0; int ecne_seen = 0; /* * How big should this be, and should it be alloc'd? Lets try the * d-mtu-ceiling for now (2k) and that should hopefully work ... * until we get into jumbo grams and such.. */ uint8_t chunk_buf[SCTP_CHUNK_BUFFER_SIZE]; struct sctp_tcb *locked_tcb = stcb; int got_auth = 0; uint32_t auth_offset = 0, auth_len = 0; int auth_skipped = 0; int asconf_cnt = 0; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_control: iphlen=%u, offset=%u, length=%u stcb:%p\n", iphlen, *offset, length, (void *)stcb); /* validate chunk header length... */ if (ntohs(ch->chunk_length) < sizeof(*ch)) { SCTPDBG(SCTP_DEBUG_INPUT1, "Invalid header length %d\n", ntohs(ch->chunk_length)); if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } /* * validate the verification tag */ vtag_in = ntohl(sh->v_tag); if (locked_tcb) { SCTP_TCB_LOCK_ASSERT(locked_tcb); } if (ch->chunk_type == SCTP_INITIATION) { SCTPDBG(SCTP_DEBUG_INPUT1, "Its an INIT of len:%d vtag:%x\n", ntohs(ch->chunk_length), vtag_in); if (vtag_in != 0) { /* protocol error- silently discard... */ SCTP_STAT_INCR(sctps_badvtag); if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } } else if (ch->chunk_type != SCTP_COOKIE_ECHO) { /* * If there is no stcb, skip the AUTH chunk and process * later after a stcb is found (to validate the lookup was * valid. */ if ((ch->chunk_type == SCTP_AUTHENTICATION) && (stcb == NULL) && (inp->auth_supported == 1)) { /* save this chunk for later processing */ auth_skipped = 1; auth_offset = *offset; auth_len = ntohs(ch->chunk_length); /* (temporarily) move past this chunk */ *offset += SCTP_SIZE32(auth_len); if (*offset >= length) { /* no more data left in the mbuf chain */ *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, sizeof(struct sctp_chunkhdr), chunk_buf); } if (ch == NULL) { /* Help */ *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } if (ch->chunk_type == SCTP_COOKIE_ECHO) { goto process_control_chunks; } /* * first check if it's an ASCONF with an unknown src addr we * need to look inside to find the association */ if (ch->chunk_type == SCTP_ASCONF && stcb == NULL) { struct sctp_chunkhdr *asconf_ch = ch; uint32_t asconf_offset = 0, asconf_len = 0; /* inp's refcount may be reduced */ SCTP_INP_INCR_REF(inp); asconf_offset = *offset; do { asconf_len = ntohs(asconf_ch->chunk_length); if (asconf_len < sizeof(struct sctp_asconf_paramhdr)) break; stcb = sctp_findassociation_ep_asconf(m, *offset, dst, sh, &inp, netp, vrf_id); if (stcb != NULL) break; asconf_offset += SCTP_SIZE32(asconf_len); asconf_ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, asconf_offset, sizeof(struct sctp_chunkhdr), chunk_buf); } while (asconf_ch != NULL && asconf_ch->chunk_type == SCTP_ASCONF); if (stcb == NULL) { /* * reduce inp's refcount if not reduced in * sctp_findassociation_ep_asconf(). */ SCTP_INP_DECR_REF(inp); } else { locked_tcb = stcb; } /* now go back and verify any auth chunk to be sure */ if (auth_skipped && (stcb != NULL)) { struct sctp_auth_chunk *auth; auth = (struct sctp_auth_chunk *) sctp_m_getptr(m, auth_offset, auth_len, chunk_buf); got_auth = 1; auth_skipped = 0; if ((auth == NULL) || sctp_handle_auth(stcb, auth, m, auth_offset)) { /* auth HMAC failed so dump it */ *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } else { /* remaining chunks are HMAC checked */ stcb->asoc.authenticated = 1; } } } if (stcb == NULL) { snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); /* no association, so it's out of the blue... */ sctp_handle_ootb(m, iphlen, *offset, src, dst, sh, inp, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } asoc = &stcb->asoc; /* ABORT and SHUTDOWN can use either v_tag... */ if ((ch->chunk_type == SCTP_ABORT_ASSOCIATION) || (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) || (ch->chunk_type == SCTP_PACKET_DROPPED)) { /* Take the T-bit always into account. */ if ((((ch->chunk_flags & SCTP_HAD_NO_TCB) == 0) && (vtag_in == asoc->my_vtag)) || (((ch->chunk_flags & SCTP_HAD_NO_TCB) == SCTP_HAD_NO_TCB) && (vtag_in == asoc->peer_vtag))) { /* this is valid */ } else { /* drop this packet... */ SCTP_STAT_INCR(sctps_badvtag); if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } } else if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { if (vtag_in != asoc->my_vtag) { /* * this could be a stale SHUTDOWN-ACK or the * peer never got the SHUTDOWN-COMPLETE and * is still hung; we have started a new asoc * but it won't complete until the shutdown * is completed */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, *offset, src, dst, sh, inp, op_err, mflowtype, mflowid, fibnum, vrf_id, port); return (NULL); } } else { /* for all other chunks, vtag must match */ if (vtag_in != asoc->my_vtag) { /* invalid vtag... */ SCTPDBG(SCTP_DEBUG_INPUT3, "invalid vtag: %xh, expect %xh\n", vtag_in, asoc->my_vtag); SCTP_STAT_INCR(sctps_badvtag); if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } } } /* end if !SCTP_COOKIE_ECHO */ /* * process all control chunks... */ if (((ch->chunk_type == SCTP_SELECTIVE_ACK) || (ch->chunk_type == SCTP_NR_SELECTIVE_ACK) || (ch->chunk_type == SCTP_HEARTBEAT_REQUEST)) && (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED)) { /* implied cookie-ack.. we must have lost the ack */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, *netp); } process_control_chunks: while (IS_SCTP_CONTROL(ch)) { /* validate chunk length */ chk_length = ntohs(ch->chunk_length); SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_process_control: processing a chunk type=%u, len=%u\n", ch->chunk_type, chk_length); SCTP_LTRACE_CHK(inp, stcb, ch->chunk_type, chk_length); if (chk_length < sizeof(*ch) || (*offset + (int)chk_length) > length) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } SCTP_STAT_INCR_COUNTER64(sctps_incontrolchunks); /* * INIT-ACK only gets the init ack "header" portion only * because we don't have to process the peer's COOKIE. All * others get a complete chunk. */ if ((ch->chunk_type == SCTP_INITIATION_ACK) || (ch->chunk_type == SCTP_INITIATION)) { /* get an init-ack chunk */ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, sizeof(struct sctp_init_ack_chunk), chunk_buf); if (ch == NULL) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } } else { /* For cookies and all other chunks. */ if (chk_length > sizeof(chunk_buf)) { /* * use just the size of the chunk buffer so * the front part of our chunks fit in * contiguous space up to the chunk buffer * size (508 bytes). For chunks that need to * get more than that they must use the * sctp_m_getptr() function or other means * (e.g. know how to parse mbuf chains). * Cookies do this already. */ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, (sizeof(chunk_buf) - 4), chunk_buf); if (ch == NULL) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } } else { /* We can fit it all */ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, chk_length, chunk_buf); if (ch == NULL) { SCTP_PRINTF("sctp_process_control: Can't get the all data....\n"); *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } } } num_chunks++; /* Save off the last place we got a control from */ if (stcb != NULL) { if (((netp != NULL) && (*netp != NULL)) || (ch->chunk_type == SCTP_ASCONF)) { /* * allow last_control to be NULL if * ASCONF... ASCONF processing will find the * right net later */ if ((netp != NULL) && (*netp != NULL)) stcb->asoc.last_control_chunk_from = *netp; } } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xB0, ch->chunk_type); #endif /* check to see if this chunk required auth, but isn't */ if ((stcb != NULL) && (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(ch->chunk_type, stcb->asoc.local_auth_chunks) && !stcb->asoc.authenticated) { /* "silently" ignore */ SCTP_STAT_INCR(sctps_recvauthmissing); goto next_chunk; } switch (ch->chunk_type) { case SCTP_INITIATION: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT\n"); /* The INIT chunk must be the only chunk. */ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { /* RFC 4960 requires that no ABORT is sent */ *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } /* Honor our resource limit. */ if (chk_length > SCTP_LARGEST_INIT_ACCEPTED) { op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); *offset = length; return (NULL); } sctp_handle_init(m, iphlen, *offset, src, dst, sh, (struct sctp_init_chunk *)ch, inp, stcb, *netp, &abort_no_unlock, mflowtype, mflowid, vrf_id, port); *offset = length; if ((!abort_no_unlock) && (locked_tcb)) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); break; case SCTP_PAD_CHUNK: break; case SCTP_INITIATION_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT-ACK\n"); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ if ((stcb) && (stcb->asoc.total_output_queue_size)) { ; } else { if ((locked_tcb != NULL) && (locked_tcb != stcb)) { /* Very unlikely */ SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; if (stcb) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_29); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } return (NULL); } } /* The INIT-ACK chunk must be the only chunk. */ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } if ((netp) && (*netp)) { ret = sctp_handle_init_ack(m, iphlen, *offset, src, dst, sh, (struct sctp_init_ack_chunk *)ch, stcb, *netp, &abort_no_unlock, mflowtype, mflowid, vrf_id); } else { ret = -1; } *offset = length; if (abort_no_unlock) { return (NULL); } /* * Special case, I must call the output routine to * get the cookie echoed */ if ((stcb != NULL) && (ret == 0)) { sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); } if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); break; case SCTP_SELECTIVE_ACK: { struct sctp_sack_chunk *sack; int abort_now = 0; uint32_t a_rwnd, cum_ack; uint16_t num_seg, num_dup; uint8_t flags; int offset_seg, offset_dup; SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SACK\n"); SCTP_STAT_INCR(sctps_recvsacks); if (stcb == NULL) { SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing SACK chunk\n"); break; } if (chk_length < sizeof(struct sctp_sack_chunk)) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on SACK chunk, too small\n"); break; } if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { /*- * If we have sent a shutdown-ack, we will pay no * attention to a sack sent in to us since * we don't care anymore. */ break; } sack = (struct sctp_sack_chunk *)ch; flags = ch->chunk_flags; cum_ack = ntohl(sack->sack.cum_tsn_ack); num_seg = ntohs(sack->sack.num_gap_ack_blks); num_dup = ntohs(sack->sack.num_dup_tsns); a_rwnd = (uint32_t)ntohl(sack->sack.a_rwnd); if (sizeof(struct sctp_sack_chunk) + num_seg * sizeof(struct sctp_gap_ack_block) + num_dup * sizeof(uint32_t) != chk_length) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of SACK chunk\n"); break; } offset_seg = *offset + sizeof(struct sctp_sack_chunk); offset_dup = offset_seg + num_seg * sizeof(struct sctp_gap_ack_block); SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SACK process cum_ack:%x num_seg:%d a_rwnd:%d\n", cum_ack, num_seg, a_rwnd); stcb->asoc.seen_a_sack_this_pkt = 1; if ((stcb->asoc.pr_sctp_cnt == 0) && (num_seg == 0) && SCTP_TSN_GE(cum_ack, stcb->asoc.last_acked_seq) && (stcb->asoc.saw_sack_with_frags == 0) && (stcb->asoc.saw_sack_with_nr_frags == 0) && (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) ) { /* * We have a SIMPLE sack having no * prior segments and data on sent * queue to be acked.. Use the * faster path sack processing. We * also allow window update sacks * with no missing segments to go * this way too. */ sctp_express_handle_sack(stcb, cum_ack, a_rwnd, &abort_now, ecne_seen); } else { if (netp && *netp) sctp_handle_sack(m, offset_seg, offset_dup, stcb, num_seg, 0, num_dup, &abort_now, flags, cum_ack, a_rwnd, ecne_seen); } if (abort_now) { /* ABORT signal from sack processing */ *offset = length; return (NULL); } if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } } break; /* * EY - nr_sack: If the received chunk is an * nr_sack chunk */ case SCTP_NR_SELECTIVE_ACK: { struct sctp_nr_sack_chunk *nr_sack; int abort_now = 0; uint32_t a_rwnd, cum_ack; uint16_t num_seg, num_nr_seg, num_dup; uint8_t flags; int offset_seg, offset_dup; SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_NR_SACK\n"); SCTP_STAT_INCR(sctps_recvsacks); if (stcb == NULL) { SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing NR-SACK chunk\n"); break; } if (stcb->asoc.nrsack_supported == 0) { goto unknown_chunk; } if (chk_length < sizeof(struct sctp_nr_sack_chunk)) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on NR-SACK chunk, too small\n"); break; } if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { /*- * If we have sent a shutdown-ack, we will pay no * attention to a sack sent in to us since * we don't care anymore. */ break; } nr_sack = (struct sctp_nr_sack_chunk *)ch; flags = ch->chunk_flags; cum_ack = ntohl(nr_sack->nr_sack.cum_tsn_ack); num_seg = ntohs(nr_sack->nr_sack.num_gap_ack_blks); num_nr_seg = ntohs(nr_sack->nr_sack.num_nr_gap_ack_blks); num_dup = ntohs(nr_sack->nr_sack.num_dup_tsns); a_rwnd = (uint32_t)ntohl(nr_sack->nr_sack.a_rwnd); if (sizeof(struct sctp_nr_sack_chunk) + (num_seg + num_nr_seg) * sizeof(struct sctp_gap_ack_block) + num_dup * sizeof(uint32_t) != chk_length) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of NR_SACK chunk\n"); break; } offset_seg = *offset + sizeof(struct sctp_nr_sack_chunk); offset_dup = offset_seg + num_seg * sizeof(struct sctp_gap_ack_block); SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_NR_SACK process cum_ack:%x num_seg:%d a_rwnd:%d\n", cum_ack, num_seg, a_rwnd); stcb->asoc.seen_a_sack_this_pkt = 1; if ((stcb->asoc.pr_sctp_cnt == 0) && (num_seg == 0) && (num_nr_seg == 0) && SCTP_TSN_GE(cum_ack, stcb->asoc.last_acked_seq) && (stcb->asoc.saw_sack_with_frags == 0) && (stcb->asoc.saw_sack_with_nr_frags == 0) && (!TAILQ_EMPTY(&stcb->asoc.sent_queue))) { /* * We have a SIMPLE sack having no * prior segments and data on sent * queue to be acked. Use the faster * path sack processing. We also * allow window update sacks with no * missing segments to go this way * too. */ sctp_express_handle_sack(stcb, cum_ack, a_rwnd, &abort_now, ecne_seen); } else { if (netp && *netp) sctp_handle_sack(m, offset_seg, offset_dup, stcb, num_seg, num_nr_seg, num_dup, &abort_now, flags, cum_ack, a_rwnd, ecne_seen); } if (abort_now) { /* ABORT signal from sack processing */ *offset = length; return (NULL); } if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } } break; case SCTP_HEARTBEAT_REQUEST: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT\n"); if ((stcb) && netp && *netp) { SCTP_STAT_INCR(sctps_recvheartbeat); sctp_send_heartbeat_ack(stcb, m, *offset, chk_length, *netp); /* He's alive so give him credit */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; } break; case SCTP_HEARTBEAT_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT-ACK\n"); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_heartbeat_chunk))) { /* Its not ours */ *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } /* He's alive so give him credit */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; SCTP_STAT_INCR(sctps_recvheartbeatack); if (netp && *netp) sctp_handle_heartbeat_ack((struct sctp_heartbeat_chunk *)ch, stcb, *netp); break; case SCTP_ABORT_ASSOCIATION: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ABORT, stcb %p\n", (void *)stcb); if ((stcb) && netp && *netp) sctp_handle_abort((struct sctp_abort_chunk *)ch, stcb, *netp); *offset = length; return (NULL); break; case SCTP_SHUTDOWN: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN, stcb %p\n", (void *)stcb); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_shutdown_chunk))) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } if (netp && *netp) { int abort_flag = 0; sctp_handle_shutdown((struct sctp_shutdown_chunk *)ch, stcb, *netp, &abort_flag); if (abort_flag) { *offset = length; return (NULL); } } break; case SCTP_SHUTDOWN_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN-ACK, stcb %p\n", (void *)stcb); if ((stcb) && (netp) && (*netp)) sctp_handle_shutdown_ack((struct sctp_shutdown_ack_chunk *)ch, stcb, *netp); *offset = length; return (NULL); break; case SCTP_OPERATION_ERROR: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_OP-ERR\n"); if ((stcb) && netp && *netp && sctp_handle_error(ch, stcb, *netp) < 0) { *offset = length; return (NULL); } break; case SCTP_COOKIE_ECHO: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE-ECHO, stcb %p\n", (void *)stcb); if ((stcb) && (stcb->asoc.total_output_queue_size)) { ; } else { if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ abend: if (stcb) { SCTP_TCB_UNLOCK(stcb); } *offset = length; return (NULL); } } /* * First are we accepting? We do this again here * since it is possible that a previous endpoint WAS * listening responded to a INIT-ACK and then * closed. We opened and bound.. and are now no * longer listening. + * + * XXXGL: notes on checking listen queue length. + * 1) SCTP_IS_LISTENING() doesn't necessarily mean + * SOLISTENING(), because a listening "UDP type" + * socket isn't listening in terms of the socket + * layer. It is a normal data flow socket, that + * can fork off new connections. Thus, we should + * look into sol_qlen only in case we are !UDP. + * 2) Checking sol_qlen in general requires locking + * the socket, and this code lacks that. */ - if ((stcb == NULL) && (!SCTP_IS_LISTENING(inp) || - inp->sctp_socket->so_qlen >= inp->sctp_socket->so_qlimit)) { + (!(inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) && + inp->sctp_socket->sol_qlen >= inp->sctp_socket->sol_qlimit))) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit))) { op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); } *offset = length; return (NULL); } else { struct mbuf *ret_buf; struct sctp_inpcb *linp; if (stcb) { linp = NULL; } else { linp = inp; } if (linp) { SCTP_ASOC_CREATE_LOCK(linp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { SCTP_ASOC_CREATE_UNLOCK(linp); goto abend; } } if (netp) { ret_buf = sctp_handle_cookie_echo(m, iphlen, *offset, src, dst, sh, (struct sctp_cookie_echo_chunk *)ch, &inp, &stcb, netp, auth_skipped, auth_offset, auth_len, &locked_tcb, mflowtype, mflowid, vrf_id, port); } else { ret_buf = NULL; } if (linp) { SCTP_ASOC_CREATE_UNLOCK(linp); } if (ret_buf == NULL) { if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } SCTPDBG(SCTP_DEBUG_INPUT3, "GAK, null buffer\n"); *offset = length; return (NULL); } /* if AUTH skipped, see if it verified... */ if (auth_skipped) { got_auth = 1; auth_skipped = 0; } if (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) { /* * Restart the timer if we have * pending data */ struct sctp_tmit_chunk *chk; chk = TAILQ_FIRST(&stcb->asoc.sent_queue); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo); } } break; case SCTP_COOKIE_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE-ACK, stcb %p\n", (void *)stcb); if ((stcb == NULL) || chk_length != sizeof(struct sctp_cookie_ack_chunk)) { if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ if ((stcb) && (stcb->asoc.total_output_queue_size)) { ; } else if (stcb) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_30); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif *offset = length; return (NULL); } } /* He's alive so give him credit */ if ((stcb) && netp && *netp) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, *netp); } break; case SCTP_ECN_ECHO: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN-ECHO\n"); /* He's alive so give him credit */ if ((stcb == NULL) || (chk_length != sizeof(struct sctp_ecne_chunk))) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if (stcb) { if (stcb->asoc.ecn_supported == 0) { goto unknown_chunk; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_ecn_echo((struct sctp_ecne_chunk *)ch, stcb); ecne_seen = 1; } break; case SCTP_ECN_CWR: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN-CWR\n"); /* He's alive so give him credit */ if ((stcb == NULL) || (chk_length != sizeof(struct sctp_cwr_chunk))) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if (stcb) { if (stcb->asoc.ecn_supported == 0) { goto unknown_chunk; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_ecn_cwr((struct sctp_cwr_chunk *)ch, stcb, *netp); } break; case SCTP_SHUTDOWN_COMPLETE: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN-COMPLETE, stcb %p\n", (void *)stcb); /* must be first and only chunk */ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } if ((stcb) && netp && *netp) { sctp_handle_shutdown_complete((struct sctp_shutdown_complete_chunk *)ch, stcb, *netp); } *offset = length; return (NULL); break; case SCTP_ASCONF: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF\n"); /* He's alive so give him credit */ if (stcb) { if (stcb->asoc.asconf_supported == 0) { goto unknown_chunk; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_asconf(m, *offset, src, (struct sctp_asconf_chunk *)ch, stcb, asconf_cnt == 0); asconf_cnt++; } break; case SCTP_ASCONF_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF-ACK\n"); if (chk_length < sizeof(struct sctp_asconf_ack_chunk)) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if ((stcb) && netp && *netp) { if (stcb->asoc.asconf_supported == 0) { goto unknown_chunk; } /* He's alive so give him credit */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_asconf_ack(m, *offset, (struct sctp_asconf_ack_chunk *)ch, stcb, *netp, &abort_no_unlock); if (abort_no_unlock) return (NULL); } break; case SCTP_FORWARD_CUM_TSN: case SCTP_IFORWARD_CUM_TSN: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_FWD-TSN\n"); if (chk_length < sizeof(struct sctp_forward_tsn_chunk)) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } /* He's alive so give him credit */ if (stcb) { int abort_flag = 0; if (stcb->asoc.prsctp_supported == 0) { goto unknown_chunk; } stcb->asoc.overall_error_count = 0; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } *fwd_tsn_seen = 1; if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_31); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif *offset = length; return (NULL); } /* * For sending a SACK this looks like DATA * chunks. */ stcb->asoc.last_data_chunk_from = stcb->asoc.last_control_chunk_from; sctp_handle_forward_tsn(stcb, (struct sctp_forward_tsn_chunk *)ch, &abort_flag, m, *offset); if (abort_flag) { *offset = length; return (NULL); } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; } } break; case SCTP_STREAM_RESET: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_STREAM_RESET\n"); if (((stcb == NULL) || (ch == NULL) || (chk_length < sizeof(struct sctp_stream_reset_tsn_req)))) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if (stcb->asoc.reconfig_supported == 0) { goto unknown_chunk; } if (sctp_handle_stream_reset(stcb, m, *offset, ch)) { /* stop processing */ *offset = length; return (NULL); } break; case SCTP_PACKET_DROPPED: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_PACKET_DROPPED\n"); /* re-get it all please */ if (chk_length < sizeof(struct sctp_pktdrop_chunk)) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if (ch && (stcb) && netp && (*netp)) { if (stcb->asoc.pktdrop_supported == 0) { goto unknown_chunk; } sctp_handle_packet_dropped((struct sctp_pktdrop_chunk *)ch, stcb, *netp, min(chk_length, (sizeof(chunk_buf) - 4))); } break; case SCTP_AUTHENTICATION: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_AUTHENTICATION\n"); if (stcb == NULL) { /* save the first AUTH for later processing */ if (auth_skipped == 0) { auth_offset = *offset; auth_len = chk_length; auth_skipped = 1; } /* skip this chunk (temporarily) */ goto next_chunk; } if (stcb->asoc.auth_supported == 0) { goto unknown_chunk; } if ((chk_length < (sizeof(struct sctp_auth_chunk))) || (chk_length > (sizeof(struct sctp_auth_chunk) + SCTP_AUTH_DIGEST_LEN_MAX))) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if (got_auth == 1) { /* skip this chunk... it's already auth'd */ goto next_chunk; } got_auth = 1; if ((ch == NULL) || sctp_handle_auth(stcb, (struct sctp_auth_chunk *)ch, m, *offset)) { /* auth HMAC failed so dump the packet */ *offset = length; return (stcb); } else { /* remaining chunks are HMAC checked */ stcb->asoc.authenticated = 1; } break; default: unknown_chunk: /* it's an unknown chunk! */ if ((ch->chunk_type & 0x40) && (stcb != NULL)) { struct sctp_gen_error_cause *cause; int len; op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_gen_error_cause), 0, M_NOWAIT, 1, MT_DATA); if (op_err != NULL) { len = min(SCTP_SIZE32(chk_length), (uint32_t)(length - *offset)); cause = mtod(op_err, struct sctp_gen_error_cause *); cause->code = htons(SCTP_CAUSE_UNRECOG_CHUNK); cause->length = htons((uint16_t)(len + sizeof(struct sctp_gen_error_cause))); SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause); SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(m, *offset, len, M_NOWAIT); if (SCTP_BUF_NEXT(op_err) != NULL) { #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(SCTP_BUF_NEXT(op_err), SCTP_MBUF_ICOPY); } #endif sctp_queue_op_err(stcb, op_err); } else { sctp_m_freem(op_err); } } } if ((ch->chunk_type & 0x80) == 0) { /* discard this packet */ *offset = length; return (stcb); } /* else skip this bad chunk and continue... */ break; } /* switch (ch->chunk_type) */ next_chunk: /* get the next chunk */ *offset += SCTP_SIZE32(chk_length); if (*offset >= length) { /* no more data left in the mbuf chain */ break; } ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, sizeof(struct sctp_chunkhdr), chunk_buf); if (ch == NULL) { if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } } /* while */ if (asconf_cnt > 0 && stcb != NULL) { sctp_send_asconf_ack(stcb); } return (stcb); } /* * common input chunk processing (v4 and v6) */ void sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int length, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, #if !defined(SCTP_WITH_NO_CSUM) uint8_t compute_crc, #endif uint8_t ecn_bits, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { uint32_t high_tsn; int fwd_tsn_seen = 0, data_processed = 0; struct mbuf *m = *mm, *op_err; char msg[SCTP_DIAG_INFO_LEN]; int un_sent; int cnt_ctrl_ready = 0; struct sctp_inpcb *inp = NULL, *inp_decr = NULL; struct sctp_tcb *stcb = NULL; struct sctp_nets *net = NULL; SCTP_STAT_INCR(sctps_recvdatagrams); #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xE0, 1); sctp_auditing(0, inp, stcb, net); #endif #if !defined(SCTP_WITH_NO_CSUM) if (compute_crc != 0) { uint32_t check, calc_check; check = sh->checksum; sh->checksum = 0; calc_check = sctp_calculate_cksum(m, iphlen); sh->checksum = check; if (calc_check != check) { SCTPDBG(SCTP_DEBUG_INPUT1, "Bad CSUM on SCTP packet calc_check:%x check:%x m:%p mlen:%d iphlen:%d\n", calc_check, check, (void *)m, length, iphlen); stcb = sctp_findassociation_addr(m, offset, src, dst, sh, ch, &inp, &net, vrf_id); #if defined(INET) || defined(INET6) if ((ch->chunk_type != SCTP_INITIATION) && (net != NULL) && (net->port != port)) { if (net->port == 0) { /* UDP encapsulation turned on. */ net->mtu -= sizeof(struct udphdr); if (stcb->asoc.smallest_mtu > net->mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } } else if (port == 0) { /* UDP encapsulation turned off. */ net->mtu += sizeof(struct udphdr); /* XXX Update smallest_mtu */ } net->port = port; } #endif if (net != NULL) { net->flowtype = mflowtype; net->flowid = mflowid; } if ((inp != NULL) && (stcb != NULL)) { sctp_send_packet_dropped(stcb, net, m, length, iphlen, 1); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_INPUT_ERROR, SCTP_SO_NOT_LOCKED); } else if ((inp != NULL) && (stcb == NULL)) { inp_decr = inp; } SCTP_STAT_INCR(sctps_badsum); SCTP_STAT_INCR_COUNTER32(sctps_checksumerrors); goto out; } } #endif /* Destination port of 0 is illegal, based on RFC4960. */ if (sh->dest_port == 0) { SCTP_STAT_INCR(sctps_hdrops); goto out; } stcb = sctp_findassociation_addr(m, offset, src, dst, sh, ch, &inp, &net, vrf_id); #if defined(INET) || defined(INET6) if ((ch->chunk_type != SCTP_INITIATION) && (net != NULL) && (net->port != port)) { if (net->port == 0) { /* UDP encapsulation turned on. */ net->mtu -= sizeof(struct udphdr); if (stcb->asoc.smallest_mtu > net->mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } } else if (port == 0) { /* UDP encapsulation turned off. */ net->mtu += sizeof(struct udphdr); /* XXX Update smallest_mtu */ } net->port = port; } #endif if (net != NULL) { net->flowtype = mflowtype; net->flowid = mflowid; } if (inp == NULL) { SCTP_STAT_INCR(sctps_noport); if (badport_bandlim(BANDLIM_SCTP_OOTB) < 0) { goto out; } if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { sctp_send_shutdown_complete2(src, dst, sh, mflowtype, mflowid, fibnum, vrf_id, port); goto out; } if (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) { goto out; } if (ch->chunk_type != SCTP_ABORT_ASSOCIATION) { if ((SCTP_BASE_SYSCTL(sctp_blackhole) == 0) || ((SCTP_BASE_SYSCTL(sctp_blackhole) == 1) && (ch->chunk_type != SCTP_INIT))) { op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Out of the blue"); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, fibnum, vrf_id, port); } } goto out; } else if (stcb == NULL) { inp_decr = inp; } SCTPDBG(SCTP_DEBUG_INPUT1, "Ok, Common input processing called, m:%p iphlen:%d offset:%d length:%d stcb:%p\n", (void *)m, iphlen, offset, length, (void *)stcb); if (stcb) { /* always clear this before beginning a packet */ stcb->asoc.authenticated = 0; stcb->asoc.seen_a_sack_this_pkt = 0; SCTPDBG(SCTP_DEBUG_INPUT1, "stcb:%p state:%x\n", (void *)stcb, stcb->asoc.state); if ((stcb->asoc.state & SCTP_STATE_WAS_ABORTED) || (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED)) { /*- * If we hit here, we had a ref count * up when the assoc was aborted and the * timer is clearing out the assoc, we should * NOT respond to any packet.. its OOTB. */ SCTP_TCB_UNLOCK(stcb); stcb = NULL; snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); goto out; } } if (IS_SCTP_CONTROL(ch)) { /* process the control portion of the SCTP packet */ /* sa_ignore NO_NULL_CHK */ stcb = sctp_process_control(m, iphlen, &offset, length, src, dst, sh, ch, inp, stcb, &net, &fwd_tsn_seen, mflowtype, mflowid, fibnum, vrf_id, port); if (stcb) { /* * This covers us if the cookie-echo was there and * it changes our INP. */ inp = stcb->sctp_ep; #if defined(INET) || defined(INET6) if ((ch->chunk_type != SCTP_INITIATION) && (net != NULL) && (net->port != port)) { if (net->port == 0) { /* UDP encapsulation turned on. */ net->mtu -= sizeof(struct udphdr); if (stcb->asoc.smallest_mtu > net->mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } } else if (port == 0) { /* UDP encapsulation turned off. */ net->mtu += sizeof(struct udphdr); /* XXX Update smallest_mtu */ } net->port = port; } #endif } } else { /* * no control chunks, so pre-process DATA chunks (these * checks are taken care of by control processing) */ /* * if DATA only packet, and auth is required, then punt... * can't have authenticated without any AUTH (control) * chunks */ if ((stcb != NULL) && (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks)) { /* "silently" ignore */ SCTP_STAT_INCR(sctps_recvauthmissing); goto out; } if (stcb == NULL) { /* out of the blue DATA chunk */ snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, mflowtype, mflowid, fibnum, vrf_id, port); goto out; } if (stcb->asoc.my_vtag != ntohl(sh->v_tag)) { /* v_tag mismatch! */ SCTP_STAT_INCR(sctps_badvtag); goto out; } } if (stcb == NULL) { /* * no valid TCB for this packet, or we found it's a bad * packet while processing control, or we're done with this * packet (done or skip rest of data), so we drop it... */ goto out; } /* * DATA chunk processing */ /* plow through the data chunks while length > offset */ /* * Rest should be DATA only. Check authentication state if AUTH for * DATA is required. */ if ((length > offset) && (stcb != NULL) && (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks) && !stcb->asoc.authenticated) { /* "silently" ignore */ SCTP_STAT_INCR(sctps_recvauthmissing); SCTPDBG(SCTP_DEBUG_AUTH1, "Data chunk requires AUTH, skipped\n"); goto trigger_send; } if (length > offset) { int retval; /* * First check to make sure our state is correct. We would * not get here unless we really did have a tag, so we don't * abort if this happens, just dump the chunk silently. */ switch (SCTP_GET_STATE(&stcb->asoc)) { case SCTP_STATE_COOKIE_ECHOED: /* * we consider data with valid tags in this state * shows us the cookie-ack was lost. Imply it was * there. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, net); break; case SCTP_STATE_COOKIE_WAIT: /* * We consider OOTB any data sent during asoc setup. */ snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); goto out; /* sa_ignore NOTREACHED */ break; case SCTP_STATE_EMPTY: /* should not happen */ case SCTP_STATE_INUSE: /* should not happen */ case SCTP_STATE_SHUTDOWN_RECEIVED: /* This is a peer error */ case SCTP_STATE_SHUTDOWN_ACK_SENT: default: goto out; /* sa_ignore NOTREACHED */ break; case SCTP_STATE_OPEN: case SCTP_STATE_SHUTDOWN_SENT: break; } /* plow through the data chunks while length > offset */ retval = sctp_process_data(mm, iphlen, &offset, length, inp, stcb, net, &high_tsn); if (retval == 2) { /* * The association aborted, NO UNLOCK needed since * the association is destroyed. */ stcb = NULL; goto out; } data_processed = 1; /* * Anything important needs to have been m_copy'ed in * process_data */ } /* take care of ecn */ if ((data_processed == 1) && (stcb->asoc.ecn_supported == 1) && ((ecn_bits & SCTP_CE_BITS) == SCTP_CE_BITS)) { /* Yep, we need to add a ECNE */ sctp_send_ecn_echo(stcb, net, high_tsn); } if ((data_processed == 0) && (fwd_tsn_seen)) { int was_a_gap; uint32_t highest_tsn; if (SCTP_TSN_GT(stcb->asoc.highest_tsn_inside_nr_map, stcb->asoc.highest_tsn_inside_map)) { highest_tsn = stcb->asoc.highest_tsn_inside_nr_map; } else { highest_tsn = stcb->asoc.highest_tsn_inside_map; } was_a_gap = SCTP_TSN_GT(highest_tsn, stcb->asoc.cumulative_tsn); stcb->asoc.send_sack = 1; sctp_sack_check(stcb, was_a_gap); } else if (fwd_tsn_seen) { stcb->asoc.send_sack = 1; } /* trigger send of any chunks in queue... */ trigger_send: #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xE0, 2); sctp_auditing(1, inp, stcb, net); #endif SCTPDBG(SCTP_DEBUG_INPUT1, "Check for chunk output prw:%d tqe:%d tf=%d\n", stcb->asoc.peers_rwnd, TAILQ_EMPTY(&stcb->asoc.control_send_queue), stcb->asoc.total_flight); un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight); if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { cnt_ctrl_ready = stcb->asoc.ctrl_queue_cnt - stcb->asoc.ecn_echo_cnt_onq; } if (!TAILQ_EMPTY(&stcb->asoc.asconf_send_queue) || cnt_ctrl_ready || stcb->asoc.trigger_reset || ((un_sent) && (stcb->asoc.peers_rwnd > 0 || (stcb->asoc.peers_rwnd <= 0 && stcb->asoc.total_flight == 0)))) { SCTPDBG(SCTP_DEBUG_INPUT3, "Calling chunk OUTPUT\n"); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); SCTPDBG(SCTP_DEBUG_INPUT3, "chunk OUTPUT returns\n"); } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xE0, 3); sctp_auditing(2, inp, stcb, net); #endif out: if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } if (inp_decr != NULL) { /* reduce ref-count */ SCTP_INP_WLOCK(inp_decr); SCTP_INP_DECR_REF(inp_decr); SCTP_INP_WUNLOCK(inp_decr); } return; } #ifdef INET void sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) { struct mbuf *m; int iphlen; uint32_t vrf_id = 0; uint8_t ecn_bits; struct sockaddr_in src, dst; struct ip *ip; struct sctphdr *sh; struct sctp_chunkhdr *ch; int length, offset; #if !defined(SCTP_WITH_NO_CSUM) uint8_t compute_crc; #endif uint32_t mflowid; uint8_t mflowtype; uint16_t fibnum; iphlen = off; if (SCTP_GET_PKT_VRFID(i_pak, vrf_id)) { SCTP_RELEASE_PKT(i_pak); return; } m = SCTP_HEADER_TO_CHAIN(i_pak); #ifdef SCTP_MBUF_LOGGING /* Log in any input mbufs */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m, SCTP_MBUF_INPUT); } #endif #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) { sctp_packet_log(m); } #endif SCTPDBG(SCTP_DEBUG_CRCOFFLOAD, "sctp_input(): Packet of length %d received on %s with csum_flags 0x%b.\n", m->m_pkthdr.len, if_name(m->m_pkthdr.rcvif), (int)m->m_pkthdr.csum_flags, CSUM_BITS); mflowid = m->m_pkthdr.flowid; mflowtype = M_HASHTYPE_GET(m); fibnum = M_GETFIB(m); SCTP_STAT_INCR(sctps_recvpackets); SCTP_STAT_INCR_COUNTER64(sctps_inpackets); /* Get IP, SCTP, and first chunk header together in the first mbuf. */ offset = iphlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); if (SCTP_BUF_LEN(m) < offset) { if ((m = m_pullup(m, offset)) == NULL) { SCTP_STAT_INCR(sctps_hdrops); return; } } ip = mtod(m, struct ip *); sh = (struct sctphdr *)((caddr_t)ip + iphlen); ch = (struct sctp_chunkhdr *)((caddr_t)sh + sizeof(struct sctphdr)); offset -= sizeof(struct sctp_chunkhdr); memset(&src, 0, sizeof(struct sockaddr_in)); src.sin_family = AF_INET; src.sin_len = sizeof(struct sockaddr_in); src.sin_port = sh->src_port; src.sin_addr = ip->ip_src; memset(&dst, 0, sizeof(struct sockaddr_in)); dst.sin_family = AF_INET; dst.sin_len = sizeof(struct sockaddr_in); dst.sin_port = sh->dest_port; dst.sin_addr = ip->ip_dst; length = ntohs(ip->ip_len); /* Validate mbuf chain length with IP payload length. */ if (SCTP_HEADER_LEN(m) != length) { SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_input() length:%d reported length:%d\n", length, SCTP_HEADER_LEN(m)); SCTP_STAT_INCR(sctps_hdrops); goto out; } /* SCTP does not allow broadcasts or multicasts */ if (IN_MULTICAST(ntohl(dst.sin_addr.s_addr))) { goto out; } if (SCTP_IS_IT_BROADCAST(dst.sin_addr, m)) { goto out; } ecn_bits = ip->ip_tos; #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_recvnocrc); #else if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; } else { SCTP_STAT_INCR(sctps_recvswcrc); compute_crc = 1; } #endif sctp_common_input_processing(&m, iphlen, offset, length, (struct sockaddr *)&src, (struct sockaddr *)&dst, sh, ch, #if !defined(SCTP_WITH_NO_CSUM) compute_crc, #endif ecn_bits, mflowtype, mflowid, fibnum, vrf_id, port); out: if (m) { sctp_m_freem(m); } return; } #if defined(__FreeBSD__) && defined(SCTP_MCORE_INPUT) && defined(SMP) extern int *sctp_cpuarry; #endif int sctp_input(struct mbuf **mp, int *offp, int proto SCTP_UNUSED) { struct mbuf *m; int off; m = *mp; off = *offp; #if defined(__FreeBSD__) && defined(SCTP_MCORE_INPUT) && defined(SMP) if (mp_ncpus > 1) { struct ip *ip; struct sctphdr *sh; int offset; int cpu_to_use; uint32_t flowid, tag; if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { flowid = m->m_pkthdr.flowid; } else { /* * No flow id built by lower layers fix it so we * create one. */ offset = off + sizeof(struct sctphdr); if (SCTP_BUF_LEN(m) < offset) { if ((m = m_pullup(m, offset)) == NULL) { SCTP_STAT_INCR(sctps_hdrops); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); sh = (struct sctphdr *)((caddr_t)ip + off); tag = htonl(sh->v_tag); flowid = tag ^ ntohs(sh->dest_port) ^ ntohs(sh->src_port); m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE_HASH); } cpu_to_use = sctp_cpuarry[flowid % mp_ncpus]; sctp_queue_to_mcore(m, off, cpu_to_use); return (IPPROTO_DONE); } #endif sctp_input_with_port(m, off, 0); return (IPPROTO_DONE); } #endif Index: head/sys/netinet/sctp_syscalls.c =================================================================== --- head/sys/netinet/sctp_syscalls.c (revision 319721) +++ head/sys/netinet/sctp_syscalls.c (revision 319722) @@ -1,597 +1,579 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include #include #include static struct syscall_helper_data sctp_syscalls[] = { SYSCALL_INIT_HELPER(sctp_peeloff), SYSCALL_INIT_HELPER(sctp_generic_sendmsg), SYSCALL_INIT_HELPER(sctp_generic_sendmsg_iov), SYSCALL_INIT_HELPER(sctp_generic_recvmsg), SYSCALL_INIT_LAST }; static void sctp_syscalls_init(void *unused __unused) { int error; error = syscall_helper_register(sctp_syscalls, SY_THR_STATIC); KASSERT((error == 0), ("%s: syscall_helper_register failed for sctp syscalls", __func__)); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(sctp_syscalls, SY_THR_STATIC); KASSERT((error == 0), ("%s: syscall32_helper_register failed for sctp syscalls", __func__)); #endif } SYSINIT(sctp_syscalls, SI_SUB_SYSCALLS, SI_ORDER_ANY, sctp_syscalls_init, NULL); /* * SCTP syscalls. * Functionality only compiled in if SCTP is defined in the kernel Makefile, * otherwise all return EOPNOTSUPP. * XXX: We should make this loadable one day. */ int sys_sctp_peeloff(td, uap) struct thread *td; struct sctp_peeloff_args /* { int sd; caddr_t name; } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct file *headfp, *nfp = NULL; struct socket *head, *so; cap_rights_t rights; u_int fflag; int error, fd; AUDIT_ARG_FD(uap->sd); error = getsock_cap(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF), &headfp, &fflag, NULL); if (error != 0) goto done2; head = headfp->f_data; if (head->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto done; } error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); if (error != 0) goto done; /* * At this point we know we do have a assoc to pull * we proceed to get the fd setup. This may block * but that is ok. */ error = falloc(td, &nfp, &fd, 0); if (error != 0) goto done; td->td_retval[0] = fd; CURVNET_SET(head->so_vnet); - so = sonewconn(head, SS_ISCONNECTED); + so = sopeeloff(head); if (so == NULL) { error = ENOMEM; goto noconnection; } - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); - soref(so); /* file descriptor reference */ - SOCK_UNLOCK(so); - - ACCEPT_LOCK(); - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_state |= (head->so_state & SS_NBIO); - so->so_state &= ~SS_NOFDREF; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - ACCEPT_UNLOCK(); finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error != 0) goto noconnection; if (head->so_sigio != NULL) fsetown(fgetown(&head->so_sigio), &so->so_sigio); noconnection: /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. */ CURVNET_RESTORE(); done: if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); done2: return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_sendmsg (td, uap) struct thread *td; struct sctp_generic_sendmsg_args /* { int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec iov[1]; cap_rights_t rights; int error = 0, len; if (uap->sinfo != NULL) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error != 0) return (error); u_sinfo = &sinfo; } cap_rights_init(&rights, CAP_SEND); if (uap->tolen != 0) { error = getsockaddr(&to, uap->to, uap->tolen); if (error != 0) { to = NULL; goto sctp_bad2; } cap_rights_set(&rights, CAP_CONNECT); } AUDIT_ARG_FD(uap->sd); error = getsock_cap(td, uap->sd, &rights, &fp, NULL, NULL); if (error != 0) goto sctp_bad; #ifdef KTRACE if (to && (KTRPOINT(td, KTR_STRUCT))) ktrsockaddr(to); #endif iov[0].iov_base = uap->msg; iov[0].iov_len = uap->mlen; so = (struct socket *)fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto sctp_bad; } #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ len = auio.uio_resid = uap->mlen; CURVNET_SET(so->so_vnet); error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, u_sinfo, td); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: if (fp != NULL) fdrop(fp, td); sctp_bad2: free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_sendmsg_iov(td, uap) struct thread *td; struct sctp_generic_sendmsg_iov_args /* { int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec *iov, *tiov; cap_rights_t rights; ssize_t len; int error, i; if (uap->sinfo != NULL) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error != 0) return (error); u_sinfo = &sinfo; } cap_rights_init(&rights, CAP_SEND); if (uap->tolen != 0) { error = getsockaddr(&to, uap->to, uap->tolen); if (error != 0) { to = NULL; goto sctp_bad2; } cap_rights_set(&rights, CAP_CONNECT); } AUDIT_ARG_FD(uap->sd); error = getsock_cap(td, uap->sd, &rights, &fp, NULL, NULL); if (error != 0) goto sctp_bad1; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) error = freebsd32_copyiniov((struct iovec32 *)uap->iov, uap->iovlen, &iov, EMSGSIZE); else #endif error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error != 0) goto sctp_bad1; #ifdef KTRACE if (to && (KTRPOINT(td, KTR_STRUCT))) ktrsockaddr(to); #endif so = (struct socket *)fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto sctp_bad; } #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto sctp_bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ len = auio.uio_resid; CURVNET_SET(so->so_vnet); error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, u_sinfo, td); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: free(iov, M_IOV); sctp_bad1: if (fp != NULL) fdrop(fp, td); sctp_bad2: free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_recvmsg(td, uap) struct thread *td; struct sctp_generic_recvmsg_args /* { int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) uint8_t sockbufstore[256]; struct uio auio; struct iovec *iov, *tiov; struct sctp_sndrcvinfo sinfo; struct socket *so; struct file *fp = NULL; struct sockaddr *fromsa; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, fromlen, i, msg_flags; AUDIT_ARG_FD(uap->sd); error = getsock_cap(td, uap->sd, cap_rights_init(&rights, CAP_RECV), &fp, NULL, NULL); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) error = freebsd32_copyiniov((struct iovec32 *)uap->iov, uap->iovlen, &iov, EMSGSIZE); else #endif error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error != 0) goto out1; so = fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto out; } #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) goto out; #endif /* MAC */ if (uap->fromlenaddr != NULL) { error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen)); if (error != 0) goto out; } else { fromlen = 0; } if (uap->msg_flags) { error = copyin(uap->msg_flags, &msg_flags, sizeof (int)); if (error != 0) goto out; } else { msg_flags = 0; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto out; } } len = auio.uio_resid; fromsa = (struct sockaddr *)sockbufstore; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo)); CURVNET_SET(so->so_vnet); error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL, fromsa, fromlen, &msg_flags, (struct sctp_sndrcvinfo *)&sinfo, 1); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } else { if (uap->sinfo) error = copyout(&sinfo, uap->sinfo, sizeof (sinfo)); } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(uap->sd, UIO_READ, ktruio, error); } #endif /* KTRACE */ if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (fromlen && uap->from) { len = fromlen; if (len <= 0 || fromsa == NULL) len = 0; else { len = MIN(len, fromsa->sa_len); error = copyout(fromsa, uap->from, (size_t)len); if (error != 0) goto out; } error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t)); if (error != 0) goto out; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif if (uap->msg_flags) { error = copyout(&msg_flags, uap->msg_flags, sizeof (int)); if (error != 0) goto out; } out: free(iov, M_IOV); out1: if (fp != NULL) fdrop(fp, td); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } Index: head/sys/netinet/sctp_sysctl.c =================================================================== --- head/sys/netinet/sctp_sysctl.c (revision 319721) +++ head/sys/netinet/sctp_sysctl.c (revision 319722) @@ -1,945 +1,945 @@ /*- * Copyright (c) 2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include FEATURE(sctp, "Stream Control Transmission Protocol"); /* * sysctl tunable variables */ void sctp_init_sysctls() { SCTP_BASE_SYSCTL(sctp_sendspace) = SCTPCTL_MAXDGRAM_DEFAULT; SCTP_BASE_SYSCTL(sctp_recvspace) = SCTPCTL_RECVSPACE_DEFAULT; SCTP_BASE_SYSCTL(sctp_auto_asconf) = SCTPCTL_AUTOASCONF_DEFAULT; SCTP_BASE_SYSCTL(sctp_multiple_asconfs) = SCTPCTL_MULTIPLEASCONFS_DEFAULT; SCTP_BASE_SYSCTL(sctp_ecn_enable) = SCTPCTL_ECN_ENABLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_pr_enable) = SCTPCTL_PR_ENABLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_auth_enable) = SCTPCTL_AUTH_ENABLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_asconf_enable) = SCTPCTL_ASCONF_ENABLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_reconfig_enable) = SCTPCTL_RECONFIG_ENABLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_nrsack_enable) = SCTPCTL_NRSACK_ENABLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_pktdrop_enable) = SCTPCTL_PKTDROP_ENABLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_peer_chunk_oh) = SCTPCTL_PEER_CHKOH_DEFAULT; SCTP_BASE_SYSCTL(sctp_max_burst_default) = SCTPCTL_MAXBURST_DEFAULT; SCTP_BASE_SYSCTL(sctp_fr_max_burst_default) = SCTPCTL_FRMAXBURST_DEFAULT; SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue) = SCTPCTL_MAXCHUNKS_DEFAULT; SCTP_BASE_SYSCTL(sctp_hashtblsize) = SCTPCTL_TCBHASHSIZE_DEFAULT; SCTP_BASE_SYSCTL(sctp_pcbtblsize) = SCTPCTL_PCBHASHSIZE_DEFAULT; SCTP_BASE_SYSCTL(sctp_min_split_point) = SCTPCTL_MIN_SPLIT_POINT_DEFAULT; SCTP_BASE_SYSCTL(sctp_chunkscale) = SCTPCTL_CHUNKSCALE_DEFAULT; SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default) = SCTPCTL_DELAYED_SACK_TIME_DEFAULT; SCTP_BASE_SYSCTL(sctp_sack_freq_default) = SCTPCTL_SACK_FREQ_DEFAULT; SCTP_BASE_SYSCTL(sctp_system_free_resc_limit) = SCTPCTL_SYS_RESOURCE_DEFAULT; SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit) = SCTPCTL_ASOC_RESOURCE_DEFAULT; SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default) = SCTPCTL_HEARTBEAT_INTERVAL_DEFAULT; SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default) = SCTPCTL_PMTU_RAISE_TIME_DEFAULT; SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default) = SCTPCTL_SHUTDOWN_GUARD_TIME_DEFAULT; SCTP_BASE_SYSCTL(sctp_secret_lifetime_default) = SCTPCTL_SECRET_LIFETIME_DEFAULT; SCTP_BASE_SYSCTL(sctp_rto_max_default) = SCTPCTL_RTO_MAX_DEFAULT; SCTP_BASE_SYSCTL(sctp_rto_min_default) = SCTPCTL_RTO_MIN_DEFAULT; SCTP_BASE_SYSCTL(sctp_rto_initial_default) = SCTPCTL_RTO_INITIAL_DEFAULT; SCTP_BASE_SYSCTL(sctp_init_rto_max_default) = SCTPCTL_INIT_RTO_MAX_DEFAULT; SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default) = SCTPCTL_VALID_COOKIE_LIFE_DEFAULT; SCTP_BASE_SYSCTL(sctp_init_rtx_max_default) = SCTPCTL_INIT_RTX_MAX_DEFAULT; SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default) = SCTPCTL_ASSOC_RTX_MAX_DEFAULT; SCTP_BASE_SYSCTL(sctp_path_rtx_max_default) = SCTPCTL_PATH_RTX_MAX_DEFAULT; SCTP_BASE_SYSCTL(sctp_path_pf_threshold) = SCTPCTL_PATH_PF_THRESHOLD_DEFAULT; SCTP_BASE_SYSCTL(sctp_add_more_threshold) = SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT; SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default) = SCTPCTL_INCOMING_STREAMS_DEFAULT; SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default) = SCTPCTL_OUTGOING_STREAMS_DEFAULT; SCTP_BASE_SYSCTL(sctp_cmt_on_off) = SCTPCTL_CMT_ON_OFF_DEFAULT; SCTP_BASE_SYSCTL(sctp_cmt_use_dac) = SCTPCTL_CMT_USE_DAC_DEFAULT; SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) = SCTPCTL_CWND_MAXBURST_DEFAULT; SCTP_BASE_SYSCTL(sctp_nat_friendly) = SCTPCTL_NAT_FRIENDLY_DEFAULT; SCTP_BASE_SYSCTL(sctp_L2_abc_variable) = SCTPCTL_ABC_L_VAR_DEFAULT; SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count) = SCTPCTL_MAX_CHAINED_MBUFS_DEFAULT; SCTP_BASE_SYSCTL(sctp_do_drain) = SCTPCTL_DO_SCTP_DRAIN_DEFAULT; SCTP_BASE_SYSCTL(sctp_hb_maxburst) = SCTPCTL_HB_MAX_BURST_DEFAULT; SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit) = SCTPCTL_ABORT_AT_LIMIT_DEFAULT; SCTP_BASE_SYSCTL(sctp_min_residual) = SCTPCTL_MIN_RESIDUAL_DEFAULT; SCTP_BASE_SYSCTL(sctp_max_retran_chunk) = SCTPCTL_MAX_RETRAN_CHUNK_DEFAULT; SCTP_BASE_SYSCTL(sctp_logging_level) = SCTPCTL_LOGGING_LEVEL_DEFAULT; SCTP_BASE_SYSCTL(sctp_default_cc_module) = SCTPCTL_DEFAULT_CC_MODULE_DEFAULT; SCTP_BASE_SYSCTL(sctp_default_ss_module) = SCTPCTL_DEFAULT_SS_MODULE_DEFAULT; SCTP_BASE_SYSCTL(sctp_default_frag_interleave) = SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DEFAULT; SCTP_BASE_SYSCTL(sctp_mobility_base) = SCTPCTL_MOBILITY_BASE_DEFAULT; SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff) = SCTPCTL_MOBILITY_FASTHANDOFF_DEFAULT; SCTP_BASE_SYSCTL(sctp_vtag_time_wait) = SCTPCTL_TIME_WAIT_DEFAULT; SCTP_BASE_SYSCTL(sctp_buffer_splitting) = SCTPCTL_BUFFER_SPLITTING_DEFAULT; SCTP_BASE_SYSCTL(sctp_initial_cwnd) = SCTPCTL_INITIAL_CWND_DEFAULT; SCTP_BASE_SYSCTL(sctp_rttvar_bw) = SCTPCTL_RTTVAR_BW_DEFAULT; SCTP_BASE_SYSCTL(sctp_rttvar_rtt) = SCTPCTL_RTTVAR_RTT_DEFAULT; SCTP_BASE_SYSCTL(sctp_rttvar_eqret) = SCTPCTL_RTTVAR_EQRET_DEFAULT; SCTP_BASE_SYSCTL(sctp_steady_step) = SCTPCTL_RTTVAR_STEADYS_DEFAULT; SCTP_BASE_SYSCTL(sctp_use_dccc_ecn) = SCTPCTL_RTTVAR_DCCCECN_DEFAULT; SCTP_BASE_SYSCTL(sctp_blackhole) = SCTPCTL_BLACKHOLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_diag_info_code) = SCTPCTL_DIAG_INFO_CODE_DEFAULT; #if defined(SCTP_LOCAL_TRACE_BUF) memset(&SCTP_BASE_SYSCTL(sctp_log), 0, sizeof(struct sctp_log)); #endif SCTP_BASE_SYSCTL(sctp_udp_tunneling_port) = SCTPCTL_UDP_TUNNELING_PORT_DEFAULT; SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) = SCTPCTL_SACK_IMMEDIATELY_ENABLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly) = SCTPCTL_NAT_FRIENDLY_INITS_DEFAULT; #if defined(SCTP_DEBUG) SCTP_BASE_SYSCTL(sctp_debug_on) = SCTPCTL_DEBUG_DEFAULT; #endif #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_BASE_SYSCTL(sctp_output_unlocked) = SCTPCTL_OUTPUT_UNLOCKED_DEFAULT; #endif } /* It returns an upper limit. No filtering is done here */ static unsigned int sctp_sysctl_number_of_addresses(struct sctp_inpcb *inp) { unsigned int cnt; struct sctp_vrf *vrf; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; struct sctp_laddr *laddr; cnt = 0; /* neither Mac OS X nor FreeBSD support mulitple routing functions */ if ((vrf = sctp_find_vrf(inp->def_vrf_id)) == NULL) { return (0); } if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: #endif cnt++; break; default: break; } } } } else { LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { switch (laddr->ifa->address.sa.sa_family) { #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: #endif cnt++; break; default: break; } } } return (cnt); } static int sctp_sysctl_copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sysctl_req *req) { struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; int loopback_scope, ipv4_local_scope, local_scope, site_scope; int ipv4_addr_legal, ipv6_addr_legal; struct sctp_vrf *vrf; struct xsctp_laddr xladdr; struct sctp_laddr *laddr; int error; /* Turn on all the appropriate scope */ if (stcb) { /* use association specific values */ loopback_scope = stcb->asoc.scope.loopback_scope; ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope; local_scope = stcb->asoc.scope.local_scope; site_scope = stcb->asoc.scope.site_scope; ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal; ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal; } else { /* Use generic values for endpoints. */ loopback_scope = 1; ipv4_local_scope = 1; local_scope = 1; site_scope = 1; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { ipv6_addr_legal = 1; if (SCTP_IPV6_V6ONLY(inp)) { ipv4_addr_legal = 0; } else { ipv4_addr_legal = 1; } } else { ipv6_addr_legal = 0; ipv4_addr_legal = 1; } } /* neither Mac OS X nor FreeBSD support mulitple routing functions */ if ((vrf = sctp_find_vrf(inp->def_vrf_id)) == NULL) { SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (-1); } if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if ((loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) /* Skip loopback if loopback_scope not set */ continue; LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { if (stcb) { /* * ignore if blacklisted at * association level */ if (sctp_is_addr_restricted(stcb, sctp_ifa)) continue; } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (ipv4_addr_legal) { struct sockaddr_in *sin; sin = &sctp_ifa->address.sin; if (sin->sin_addr.s_addr == 0) continue; if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if ((ipv4_local_scope == 0) && (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) continue; } else { continue; } break; #endif #ifdef INET6 case AF_INET6: if (ipv6_addr_legal) { struct sockaddr_in6 *sin6; sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) continue; if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (local_scope == 0) continue; } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) continue; } else { continue; } break; #endif default: continue; } memset((void *)&xladdr, 0, sizeof(struct xsctp_laddr)); memcpy((void *)&xladdr.address, (const void *)&sctp_ifa->address, sizeof(union sctp_sockstore)); SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); error = SYSCTL_OUT(req, &xladdr, sizeof(struct xsctp_laddr)); if (error) { return (error); } else { SCTP_INP_INFO_RLOCK(); SCTP_INP_RLOCK(inp); } } } } else { LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { /* ignore if blacklisted at association level */ if (stcb && sctp_is_addr_restricted(stcb, laddr->ifa)) continue; memset((void *)&xladdr, 0, sizeof(struct xsctp_laddr)); memcpy((void *)&xladdr.address, (const void *)&laddr->ifa->address, sizeof(union sctp_sockstore)); xladdr.start_time.tv_sec = (uint32_t)laddr->start_time.tv_sec; xladdr.start_time.tv_usec = (uint32_t)laddr->start_time.tv_usec; SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); error = SYSCTL_OUT(req, &xladdr, sizeof(struct xsctp_laddr)); if (error) { return (error); } else { SCTP_INP_INFO_RLOCK(); SCTP_INP_RLOCK(inp); } } } memset((void *)&xladdr, 0, sizeof(struct xsctp_laddr)); xladdr.last = 1; SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); error = SYSCTL_OUT(req, &xladdr, sizeof(struct xsctp_laddr)); if (error) { return (error); } else { SCTP_INP_INFO_RLOCK(); SCTP_INP_RLOCK(inp); return (0); } } /* * sysctl functions */ static int sctp_sysctl_handle_assoclist(SYSCTL_HANDLER_ARGS) { unsigned int number_of_endpoints; unsigned int number_of_local_addresses; unsigned int number_of_associations; unsigned int number_of_remote_addresses; unsigned int n; int error; struct sctp_inpcb *inp; struct sctp_tcb *stcb; struct sctp_nets *net; struct xsctp_inpcb xinpcb; struct xsctp_tcb xstcb; struct xsctp_raddr xraddr; struct socket *so; number_of_endpoints = 0; number_of_local_addresses = 0; number_of_associations = 0; number_of_remote_addresses = 0; SCTP_INP_INFO_RLOCK(); if (req->oldptr == NULL) { LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) { SCTP_INP_RLOCK(inp); number_of_endpoints++; number_of_local_addresses += sctp_sysctl_number_of_addresses(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { number_of_associations++; number_of_local_addresses += sctp_sysctl_number_of_addresses(inp); TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { number_of_remote_addresses++; } } SCTP_INP_RUNLOCK(inp); } SCTP_INP_INFO_RUNLOCK(); n = (number_of_endpoints + 1) * sizeof(struct xsctp_inpcb) + (number_of_local_addresses + number_of_endpoints + number_of_associations) * sizeof(struct xsctp_laddr) + (number_of_associations + number_of_endpoints) * sizeof(struct xsctp_tcb) + (number_of_remote_addresses + number_of_associations) * sizeof(struct xsctp_raddr); /* request some more memory than needed */ req->oldidx = (n + n / 8); return (0); } if (req->newptr != NULL) { SCTP_INP_INFO_RUNLOCK(); SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_SYSCTL, EPERM); return (EPERM); } LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { /* if its allgone it is being freed - skip it */ goto skip; } xinpcb.last = 0; xinpcb.local_port = ntohs(inp->sctp_lport); xinpcb.flags = inp->sctp_flags; xinpcb.features = inp->sctp_features; xinpcb.total_sends = inp->total_sends; xinpcb.total_recvs = inp->total_recvs; xinpcb.total_nospaces = inp->total_nospaces; xinpcb.fragmentation_point = inp->sctp_frag_point; xinpcb.socket = inp->sctp_socket; so = inp->sctp_socket; if ((so == NULL) || (!SCTP_IS_LISTENING(inp)) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { xinpcb.qlen = 0; xinpcb.maxqlen = 0; } else { - xinpcb.qlen = so->so_qlen; - xinpcb.qlen_old = so->so_qlen > USHRT_MAX ? - USHRT_MAX : (uint16_t)so->so_qlen; - xinpcb.maxqlen = so->so_qlimit; - xinpcb.maxqlen_old = so->so_qlimit > USHRT_MAX ? - USHRT_MAX : (uint16_t)so->so_qlimit; + xinpcb.qlen = so->sol_qlen; + xinpcb.qlen_old = so->sol_qlen > USHRT_MAX ? + USHRT_MAX : (uint16_t)so->sol_qlen; + xinpcb.maxqlen = so->sol_qlimit; + xinpcb.maxqlen_old = so->sol_qlimit > USHRT_MAX ? + USHRT_MAX : (uint16_t)so->sol_qlimit; } SCTP_INP_INCR_REF(inp); SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); error = SYSCTL_OUT(req, &xinpcb, sizeof(struct xsctp_inpcb)); if (error) { SCTP_INP_DECR_REF(inp); return (error); } SCTP_INP_INFO_RLOCK(); SCTP_INP_RLOCK(inp); error = sctp_sysctl_copy_out_local_addresses(inp, NULL, req); if (error) { SCTP_INP_DECR_REF(inp); return (error); } LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); xstcb.last = 0; xstcb.local_port = ntohs(inp->sctp_lport); xstcb.remote_port = ntohs(stcb->rport); if (stcb->asoc.primary_destination != NULL) xstcb.primary_addr = stcb->asoc.primary_destination->ro._l_addr; xstcb.heartbeat_interval = stcb->asoc.heart_beat_delay; xstcb.state = (uint32_t)sctp_map_assoc_state(stcb->asoc.state); /* 7.0 does not support these */ xstcb.assoc_id = sctp_get_associd(stcb); xstcb.peers_rwnd = stcb->asoc.peers_rwnd; xstcb.in_streams = stcb->asoc.streamincnt; xstcb.out_streams = stcb->asoc.streamoutcnt; xstcb.max_nr_retrans = stcb->asoc.overall_error_count; xstcb.primary_process = 0; /* not really supported * yet */ xstcb.T1_expireries = stcb->asoc.timoinit + stcb->asoc.timocookie; xstcb.T2_expireries = stcb->asoc.timoshutdown + stcb->asoc.timoshutdownack; xstcb.retransmitted_tsns = stcb->asoc.marked_retrans; xstcb.start_time.tv_sec = (uint32_t)stcb->asoc.start_time.tv_sec; xstcb.start_time.tv_usec = (uint32_t)stcb->asoc.start_time.tv_usec; xstcb.discontinuity_time.tv_sec = (uint32_t)stcb->asoc.discontinuity_time.tv_sec; xstcb.discontinuity_time.tv_usec = (uint32_t)stcb->asoc.discontinuity_time.tv_usec; xstcb.total_sends = stcb->total_sends; xstcb.total_recvs = stcb->total_recvs; xstcb.local_tag = stcb->asoc.my_vtag; xstcb.remote_tag = stcb->asoc.peer_vtag; xstcb.initial_tsn = stcb->asoc.init_seq_number; xstcb.highest_tsn = stcb->asoc.sending_seq - 1; xstcb.cumulative_tsn = stcb->asoc.last_acked_seq; xstcb.cumulative_tsn_ack = stcb->asoc.cumulative_tsn; xstcb.mtu = stcb->asoc.smallest_mtu; xstcb.refcnt = stcb->asoc.refcnt; SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); error = SYSCTL_OUT(req, &xstcb, sizeof(struct xsctp_tcb)); if (error) { SCTP_INP_DECR_REF(inp); atomic_subtract_int(&stcb->asoc.refcnt, 1); return (error); } SCTP_INP_INFO_RLOCK(); SCTP_INP_RLOCK(inp); error = sctp_sysctl_copy_out_local_addresses(inp, stcb, req); if (error) { SCTP_INP_DECR_REF(inp); atomic_subtract_int(&stcb->asoc.refcnt, 1); return (error); } TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { xraddr.last = 0; xraddr.address = net->ro._l_addr; xraddr.active = ((net->dest_state & SCTP_ADDR_REACHABLE) == SCTP_ADDR_REACHABLE); xraddr.confirmed = ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0); xraddr.heartbeat_enabled = ((net->dest_state & SCTP_ADDR_NOHB) == 0); xraddr.potentially_failed = ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF); xraddr.rto = net->RTO; xraddr.max_path_rtx = net->failure_threshold; xraddr.rtx = net->marked_retrans; xraddr.error_counter = net->error_count; xraddr.cwnd = net->cwnd; xraddr.flight_size = net->flight_size; xraddr.mtu = net->mtu; xraddr.rtt = net->rtt / 1000; xraddr.heartbeat_interval = net->heart_beat_delay; xraddr.ssthresh = net->ssthresh; xraddr.start_time.tv_sec = (uint32_t)net->start_time.tv_sec; xraddr.start_time.tv_usec = (uint32_t)net->start_time.tv_usec; SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); error = SYSCTL_OUT(req, &xraddr, sizeof(struct xsctp_raddr)); if (error) { SCTP_INP_DECR_REF(inp); atomic_subtract_int(&stcb->asoc.refcnt, 1); return (error); } SCTP_INP_INFO_RLOCK(); SCTP_INP_RLOCK(inp); } atomic_subtract_int(&stcb->asoc.refcnt, 1); memset((void *)&xraddr, 0, sizeof(struct xsctp_raddr)); xraddr.last = 1; SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); error = SYSCTL_OUT(req, &xraddr, sizeof(struct xsctp_raddr)); if (error) { SCTP_INP_DECR_REF(inp); return (error); } SCTP_INP_INFO_RLOCK(); SCTP_INP_RLOCK(inp); } SCTP_INP_DECR_REF(inp); SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); memset((void *)&xstcb, 0, sizeof(struct xsctp_tcb)); xstcb.last = 1; error = SYSCTL_OUT(req, &xstcb, sizeof(struct xsctp_tcb)); if (error) { return (error); } skip: SCTP_INP_INFO_RLOCK(); } SCTP_INP_INFO_RUNLOCK(); memset((void *)&xinpcb, 0, sizeof(struct xsctp_inpcb)); xinpcb.last = 1; error = SYSCTL_OUT(req, &xinpcb, sizeof(struct xsctp_inpcb)); return (error); } static int sctp_sysctl_handle_udp_tunneling(SYSCTL_HANDLER_ARGS) { int error; uint32_t old, new; SCTP_INP_INFO_RLOCK(); old = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port); SCTP_INP_INFO_RUNLOCK(); new = old; error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { #if (SCTPCTL_UDP_TUNNELING_PORT_MIN == 0) if (new > SCTPCTL_UDP_TUNNELING_PORT_MAX) { #else if ((new < SCTPCTL_UDP_TUNNELING_PORT_MIN) || (new > SCTPCTL_UDP_TUNNELING_PORT_MAX)) { #endif error = EINVAL; } else { SCTP_INP_INFO_WLOCK(); SCTP_BASE_SYSCTL(sctp_udp_tunneling_port) = new; if (old != 0) { sctp_over_udp_stop(); } if (new != 0) { error = sctp_over_udp_start(); } SCTP_INP_INFO_WUNLOCK(); } } return (error); } static int sctp_sysctl_handle_auth(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = SCTP_BASE_SYSCTL(sctp_auth_enable); error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { #if (SCTPCTL_AUTH_ENABLE_MIN == 0) if ((new > SCTPCTL_AUTH_ENABLE_MAX) || ((new == 0) && (SCTP_BASE_SYSCTL(sctp_asconf_enable) == 1))) { #else if ((new < SCTPCTL_AUTH_ENABLE_MIN) || (new > SCTPCTL_AUTH_ENABLE_MAX) || ((new == 0) && (SCTP_BASE_SYSCTL(sctp_asconf_enable) == 1))) { #endif error = EINVAL; } else { SCTP_BASE_SYSCTL(sctp_auth_enable) = new; } } return (error); } static int sctp_sysctl_handle_asconf(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = SCTP_BASE_SYSCTL(sctp_asconf_enable); error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { #if (SCTPCTL_ASCONF_ENABLE_MIN == 0) if ((new > SCTPCTL_ASCONF_ENABLE_MAX) || ((new == 1) && (SCTP_BASE_SYSCTL(sctp_auth_enable) == 0))) { #else if ((new < SCTPCTL_ASCONF_ENABLE_MIN) || (new > SCTPCTL_ASCONF_ENABLE_MAX) || ((new == 1) && (SCTP_BASE_SYSCTL(sctp_auth_enable) == 0))) { #endif error = EINVAL; } else { SCTP_BASE_SYSCTL(sctp_asconf_enable) = new; } } return (error); } static int sctp_sysctl_handle_stats(SYSCTL_HANDLER_ARGS) { int error; #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) struct sctpstat *sarry; struct sctpstat sb; int cpu; #endif struct sctpstat sb_temp; if ((req->newptr != NULL) && (req->newlen != sizeof(struct sctpstat))) { return (EINVAL); } memset(&sb_temp, 0, sizeof(struct sctpstat)); if (req->newptr != NULL) { error = SYSCTL_IN(req, &sb_temp, sizeof(struct sctpstat)); if (error != 0) { return (error); } } #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) memset(&sb, 0, sizeof(sb)); for (cpu = 0; cpu < mp_maxid; cpu++) { sarry = &SCTP_BASE_STATS[cpu]; if (sarry->sctps_discontinuitytime.tv_sec > sb.sctps_discontinuitytime.tv_sec) { sb.sctps_discontinuitytime.tv_sec = sarry->sctps_discontinuitytime.tv_sec; sb.sctps_discontinuitytime.tv_usec = sarry->sctps_discontinuitytime.tv_usec; } sb.sctps_currestab += sarry->sctps_currestab; sb.sctps_activeestab += sarry->sctps_activeestab; sb.sctps_restartestab += sarry->sctps_restartestab; sb.sctps_collisionestab += sarry->sctps_collisionestab; sb.sctps_passiveestab += sarry->sctps_passiveestab; sb.sctps_aborted += sarry->sctps_aborted; sb.sctps_shutdown += sarry->sctps_shutdown; sb.sctps_outoftheblue += sarry->sctps_outoftheblue; sb.sctps_checksumerrors += sarry->sctps_checksumerrors; sb.sctps_outcontrolchunks += sarry->sctps_outcontrolchunks; sb.sctps_outorderchunks += sarry->sctps_outorderchunks; sb.sctps_outunorderchunks += sarry->sctps_outunorderchunks; sb.sctps_incontrolchunks += sarry->sctps_incontrolchunks; sb.sctps_inorderchunks += sarry->sctps_inorderchunks; sb.sctps_inunorderchunks += sarry->sctps_inunorderchunks; sb.sctps_fragusrmsgs += sarry->sctps_fragusrmsgs; sb.sctps_reasmusrmsgs += sarry->sctps_reasmusrmsgs; sb.sctps_outpackets += sarry->sctps_outpackets; sb.sctps_inpackets += sarry->sctps_inpackets; sb.sctps_recvpackets += sarry->sctps_recvpackets; sb.sctps_recvdatagrams += sarry->sctps_recvdatagrams; sb.sctps_recvpktwithdata += sarry->sctps_recvpktwithdata; sb.sctps_recvsacks += sarry->sctps_recvsacks; sb.sctps_recvdata += sarry->sctps_recvdata; sb.sctps_recvdupdata += sarry->sctps_recvdupdata; sb.sctps_recvheartbeat += sarry->sctps_recvheartbeat; sb.sctps_recvheartbeatack += sarry->sctps_recvheartbeatack; sb.sctps_recvecne += sarry->sctps_recvecne; sb.sctps_recvauth += sarry->sctps_recvauth; sb.sctps_recvauthmissing += sarry->sctps_recvauthmissing; sb.sctps_recvivalhmacid += sarry->sctps_recvivalhmacid; sb.sctps_recvivalkeyid += sarry->sctps_recvivalkeyid; sb.sctps_recvauthfailed += sarry->sctps_recvauthfailed; sb.sctps_recvexpress += sarry->sctps_recvexpress; sb.sctps_recvexpressm += sarry->sctps_recvexpressm; sb.sctps_recvnocrc += sarry->sctps_recvnocrc; sb.sctps_recvswcrc += sarry->sctps_recvswcrc; sb.sctps_recvhwcrc += sarry->sctps_recvhwcrc; sb.sctps_sendpackets += sarry->sctps_sendpackets; sb.sctps_sendsacks += sarry->sctps_sendsacks; sb.sctps_senddata += sarry->sctps_senddata; sb.sctps_sendretransdata += sarry->sctps_sendretransdata; sb.sctps_sendfastretrans += sarry->sctps_sendfastretrans; sb.sctps_sendmultfastretrans += sarry->sctps_sendmultfastretrans; sb.sctps_sendheartbeat += sarry->sctps_sendheartbeat; sb.sctps_sendecne += sarry->sctps_sendecne; sb.sctps_sendauth += sarry->sctps_sendauth; sb.sctps_senderrors += sarry->sctps_senderrors; sb.sctps_sendnocrc += sarry->sctps_sendnocrc; sb.sctps_sendswcrc += sarry->sctps_sendswcrc; sb.sctps_sendhwcrc += sarry->sctps_sendhwcrc; sb.sctps_pdrpfmbox += sarry->sctps_pdrpfmbox; sb.sctps_pdrpfehos += sarry->sctps_pdrpfehos; sb.sctps_pdrpmbda += sarry->sctps_pdrpmbda; sb.sctps_pdrpmbct += sarry->sctps_pdrpmbct; sb.sctps_pdrpbwrpt += sarry->sctps_pdrpbwrpt; sb.sctps_pdrpcrupt += sarry->sctps_pdrpcrupt; sb.sctps_pdrpnedat += sarry->sctps_pdrpnedat; sb.sctps_pdrppdbrk += sarry->sctps_pdrppdbrk; sb.sctps_pdrptsnnf += sarry->sctps_pdrptsnnf; sb.sctps_pdrpdnfnd += sarry->sctps_pdrpdnfnd; sb.sctps_pdrpdiwnp += sarry->sctps_pdrpdiwnp; sb.sctps_pdrpdizrw += sarry->sctps_pdrpdizrw; sb.sctps_pdrpbadd += sarry->sctps_pdrpbadd; sb.sctps_pdrpmark += sarry->sctps_pdrpmark; sb.sctps_timoiterator += sarry->sctps_timoiterator; sb.sctps_timodata += sarry->sctps_timodata; sb.sctps_timowindowprobe += sarry->sctps_timowindowprobe; sb.sctps_timoinit += sarry->sctps_timoinit; sb.sctps_timosack += sarry->sctps_timosack; sb.sctps_timoshutdown += sarry->sctps_timoshutdown; sb.sctps_timoheartbeat += sarry->sctps_timoheartbeat; sb.sctps_timocookie += sarry->sctps_timocookie; sb.sctps_timosecret += sarry->sctps_timosecret; sb.sctps_timopathmtu += sarry->sctps_timopathmtu; sb.sctps_timoshutdownack += sarry->sctps_timoshutdownack; sb.sctps_timoshutdownguard += sarry->sctps_timoshutdownguard; sb.sctps_timostrmrst += sarry->sctps_timostrmrst; sb.sctps_timoearlyfr += sarry->sctps_timoearlyfr; sb.sctps_timoasconf += sarry->sctps_timoasconf; sb.sctps_timodelprim += sarry->sctps_timodelprim; sb.sctps_timoautoclose += sarry->sctps_timoautoclose; sb.sctps_timoassockill += sarry->sctps_timoassockill; sb.sctps_timoinpkill += sarry->sctps_timoinpkill; sb.sctps_hdrops += sarry->sctps_hdrops; sb.sctps_badsum += sarry->sctps_badsum; sb.sctps_noport += sarry->sctps_noport; sb.sctps_badvtag += sarry->sctps_badvtag; sb.sctps_badsid += sarry->sctps_badsid; sb.sctps_nomem += sarry->sctps_nomem; sb.sctps_fastretransinrtt += sarry->sctps_fastretransinrtt; sb.sctps_markedretrans += sarry->sctps_markedretrans; sb.sctps_naglesent += sarry->sctps_naglesent; sb.sctps_naglequeued += sarry->sctps_naglequeued; sb.sctps_maxburstqueued += sarry->sctps_maxburstqueued; sb.sctps_ifnomemqueued += sarry->sctps_ifnomemqueued; sb.sctps_windowprobed += sarry->sctps_windowprobed; sb.sctps_lowlevelerr += sarry->sctps_lowlevelerr; sb.sctps_lowlevelerrusr += sarry->sctps_lowlevelerrusr; sb.sctps_datadropchklmt += sarry->sctps_datadropchklmt; sb.sctps_datadroprwnd += sarry->sctps_datadroprwnd; sb.sctps_ecnereducedcwnd += sarry->sctps_ecnereducedcwnd; sb.sctps_vtagexpress += sarry->sctps_vtagexpress; sb.sctps_vtagbogus += sarry->sctps_vtagbogus; sb.sctps_primary_randry += sarry->sctps_primary_randry; sb.sctps_cmt_randry += sarry->sctps_cmt_randry; sb.sctps_slowpath_sack += sarry->sctps_slowpath_sack; sb.sctps_wu_sacks_sent += sarry->sctps_wu_sacks_sent; sb.sctps_sends_with_flags += sarry->sctps_sends_with_flags; sb.sctps_sends_with_unord += sarry->sctps_sends_with_unord; sb.sctps_sends_with_eof += sarry->sctps_sends_with_eof; sb.sctps_sends_with_abort += sarry->sctps_sends_with_abort; sb.sctps_protocol_drain_calls += sarry->sctps_protocol_drain_calls; sb.sctps_protocol_drains_done += sarry->sctps_protocol_drains_done; sb.sctps_read_peeks += sarry->sctps_read_peeks; sb.sctps_cached_chk += sarry->sctps_cached_chk; sb.sctps_cached_strmoq += sarry->sctps_cached_strmoq; sb.sctps_left_abandon += sarry->sctps_left_abandon; sb.sctps_send_burst_avoid += sarry->sctps_send_burst_avoid; sb.sctps_send_cwnd_avoid += sarry->sctps_send_cwnd_avoid; sb.sctps_fwdtsn_map_over += sarry->sctps_fwdtsn_map_over; if (req->newptr != NULL) { memcpy(sarry, &sb_temp, sizeof(struct sctpstat)); } } error = SYSCTL_OUT(req, &sb, sizeof(struct sctpstat)); #else error = SYSCTL_OUT(req, &SCTP_BASE_STATS, sizeof(struct sctpstat)); if (error != 0) { return (error); } if (req->newptr != NULL) { memcpy(&SCTP_BASE_STATS, &sb_temp, sizeof(struct sctpstat)); } #endif return (error); } #if defined(SCTP_LOCAL_TRACE_BUF) static int sctp_sysctl_handle_trace_log(SYSCTL_HANDLER_ARGS) { int error; error = SYSCTL_OUT(req, &SCTP_BASE_SYSCTL(sctp_log), sizeof(struct sctp_log)); return (error); } static int sctp_sysctl_handle_trace_log_clear(SYSCTL_HANDLER_ARGS) { int error = 0; memset(&SCTP_BASE_SYSCTL(sctp_log), 0, sizeof(struct sctp_log)); return (error); } #endif #define SCTP_UINT_SYSCTL(mib_name, var_name, prefix) \ static int \ sctp_sysctl_handle_##mib_name(SYSCTL_HANDLER_ARGS) \ { \ int error; \ uint32_t new; \ \ new = SCTP_BASE_SYSCTL(var_name); \ error = sysctl_handle_int(oidp, &new, 0, req); \ if ((error == 0) && (req->newptr != NULL)) { \ if ((new < prefix##_MIN) || \ (new > prefix##_MAX)) { \ error = EINVAL; \ } else { \ SCTP_BASE_SYSCTL(var_name) = new; \ } \ } \ return (error); \ } \ SYSCTL_PROC(_net_inet_sctp, OID_AUTO, mib_name, \ CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, NULL, 0, \ sctp_sysctl_handle_##mib_name, "UI", prefix##_DESC); /* * sysctl definitions */ SCTP_UINT_SYSCTL(sendspace, sctp_sendspace, SCTPCTL_MAXDGRAM) SCTP_UINT_SYSCTL(recvspace, sctp_recvspace, SCTPCTL_RECVSPACE) SCTP_UINT_SYSCTL(auto_asconf, sctp_auto_asconf, SCTPCTL_AUTOASCONF) SCTP_UINT_SYSCTL(ecn_enable, sctp_ecn_enable, SCTPCTL_ECN_ENABLE) SCTP_UINT_SYSCTL(pr_enable, sctp_pr_enable, SCTPCTL_PR_ENABLE) SYSCTL_PROC(_net_inet_sctp, OID_AUTO, auth_enable, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, sctp_sysctl_handle_auth, "IU", SCTPCTL_AUTH_ENABLE_DESC); SYSCTL_PROC(_net_inet_sctp, OID_AUTO, asconf_enable, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, sctp_sysctl_handle_asconf, "IU", SCTPCTL_ASCONF_ENABLE_DESC); SCTP_UINT_SYSCTL(reconfig_enable, sctp_reconfig_enable, SCTPCTL_RECONFIG_ENABLE) SCTP_UINT_SYSCTL(nrsack_enable, sctp_nrsack_enable, SCTPCTL_NRSACK_ENABLE) SCTP_UINT_SYSCTL(pktdrop_enable, sctp_pktdrop_enable, SCTPCTL_PKTDROP_ENABLE) SCTP_UINT_SYSCTL(peer_chkoh, sctp_peer_chunk_oh, SCTPCTL_PEER_CHKOH) SCTP_UINT_SYSCTL(maxburst, sctp_max_burst_default, SCTPCTL_MAXBURST) SCTP_UINT_SYSCTL(fr_maxburst, sctp_fr_max_burst_default, SCTPCTL_FRMAXBURST) SCTP_UINT_SYSCTL(maxchunks, sctp_max_chunks_on_queue, SCTPCTL_MAXCHUNKS) SCTP_UINT_SYSCTL(tcbhashsize, sctp_hashtblsize, SCTPCTL_TCBHASHSIZE) SCTP_UINT_SYSCTL(pcbhashsize, sctp_pcbtblsize, SCTPCTL_PCBHASHSIZE) SCTP_UINT_SYSCTL(min_split_point, sctp_min_split_point, SCTPCTL_MIN_SPLIT_POINT) SCTP_UINT_SYSCTL(chunkscale, sctp_chunkscale, SCTPCTL_CHUNKSCALE) SCTP_UINT_SYSCTL(delayed_sack_time, sctp_delayed_sack_time_default, SCTPCTL_DELAYED_SACK_TIME) SCTP_UINT_SYSCTL(sack_freq, sctp_sack_freq_default, SCTPCTL_SACK_FREQ) SCTP_UINT_SYSCTL(sys_resource, sctp_system_free_resc_limit, SCTPCTL_SYS_RESOURCE) SCTP_UINT_SYSCTL(asoc_resource, sctp_asoc_free_resc_limit, SCTPCTL_ASOC_RESOURCE) SCTP_UINT_SYSCTL(heartbeat_interval, sctp_heartbeat_interval_default, SCTPCTL_HEARTBEAT_INTERVAL) SCTP_UINT_SYSCTL(pmtu_raise_time, sctp_pmtu_raise_time_default, SCTPCTL_PMTU_RAISE_TIME) SCTP_UINT_SYSCTL(shutdown_guard_time, sctp_shutdown_guard_time_default, SCTPCTL_SHUTDOWN_GUARD_TIME) SCTP_UINT_SYSCTL(secret_lifetime, sctp_secret_lifetime_default, SCTPCTL_SECRET_LIFETIME) SCTP_UINT_SYSCTL(rto_max, sctp_rto_max_default, SCTPCTL_RTO_MAX) SCTP_UINT_SYSCTL(rto_min, sctp_rto_min_default, SCTPCTL_RTO_MIN) SCTP_UINT_SYSCTL(rto_initial, sctp_rto_initial_default, SCTPCTL_RTO_INITIAL) SCTP_UINT_SYSCTL(init_rto_max, sctp_init_rto_max_default, SCTPCTL_INIT_RTO_MAX) SCTP_UINT_SYSCTL(valid_cookie_life, sctp_valid_cookie_life_default, SCTPCTL_VALID_COOKIE_LIFE) SCTP_UINT_SYSCTL(init_rtx_max, sctp_init_rtx_max_default, SCTPCTL_INIT_RTX_MAX) SCTP_UINT_SYSCTL(assoc_rtx_max, sctp_assoc_rtx_max_default, SCTPCTL_ASSOC_RTX_MAX) SCTP_UINT_SYSCTL(path_rtx_max, sctp_path_rtx_max_default, SCTPCTL_PATH_RTX_MAX) SCTP_UINT_SYSCTL(path_pf_threshold, sctp_path_pf_threshold, SCTPCTL_PATH_PF_THRESHOLD) SCTP_UINT_SYSCTL(add_more_on_output, sctp_add_more_threshold, SCTPCTL_ADD_MORE_ON_OUTPUT) SCTP_UINT_SYSCTL(incoming_streams, sctp_nr_incoming_streams_default, SCTPCTL_INCOMING_STREAMS) SCTP_UINT_SYSCTL(outgoing_streams, sctp_nr_outgoing_streams_default, SCTPCTL_OUTGOING_STREAMS) SCTP_UINT_SYSCTL(cmt_on_off, sctp_cmt_on_off, SCTPCTL_CMT_ON_OFF) SCTP_UINT_SYSCTL(cmt_use_dac, sctp_cmt_use_dac, SCTPCTL_CMT_USE_DAC) SCTP_UINT_SYSCTL(cwnd_maxburst, sctp_use_cwnd_based_maxburst, SCTPCTL_CWND_MAXBURST) SCTP_UINT_SYSCTL(nat_friendly, sctp_nat_friendly, SCTPCTL_NAT_FRIENDLY) SCTP_UINT_SYSCTL(abc_l_var, sctp_L2_abc_variable, SCTPCTL_ABC_L_VAR) SCTP_UINT_SYSCTL(max_chained_mbufs, sctp_mbuf_threshold_count, SCTPCTL_MAX_CHAINED_MBUFS) SCTP_UINT_SYSCTL(do_sctp_drain, sctp_do_drain, SCTPCTL_DO_SCTP_DRAIN) SCTP_UINT_SYSCTL(hb_max_burst, sctp_hb_maxburst, SCTPCTL_HB_MAX_BURST) SCTP_UINT_SYSCTL(abort_at_limit, sctp_abort_if_one_2_one_hits_limit, SCTPCTL_ABORT_AT_LIMIT) SCTP_UINT_SYSCTL(min_residual, sctp_min_residual, SCTPCTL_MIN_RESIDUAL) SCTP_UINT_SYSCTL(max_retran_chunk, sctp_max_retran_chunk, SCTPCTL_MAX_RETRAN_CHUNK) SCTP_UINT_SYSCTL(log_level, sctp_logging_level, SCTPCTL_LOGGING_LEVEL) SCTP_UINT_SYSCTL(default_cc_module, sctp_default_cc_module, SCTPCTL_DEFAULT_CC_MODULE) SCTP_UINT_SYSCTL(default_ss_module, sctp_default_ss_module, SCTPCTL_DEFAULT_SS_MODULE) SCTP_UINT_SYSCTL(default_frag_interleave, sctp_default_frag_interleave, SCTPCTL_DEFAULT_FRAG_INTERLEAVE) SCTP_UINT_SYSCTL(mobility_base, sctp_mobility_base, SCTPCTL_MOBILITY_BASE) SCTP_UINT_SYSCTL(mobility_fasthandoff, sctp_mobility_fasthandoff, SCTPCTL_MOBILITY_FASTHANDOFF) #if defined(SCTP_LOCAL_TRACE_BUF) SYSCTL_PROC(_net_inet_sctp, OID_AUTO, log, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_RD, NULL, 0, sctp_sysctl_handle_trace_log, "S,sctplog", "SCTP logging (struct sctp_log)"); SYSCTL_PROC(_net_inet_sctp, OID_AUTO, clear_trace, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, sctp_sysctl_handle_trace_log_clear, "IU", "Clear SCTP Logging buffer"); #endif SYSCTL_PROC(_net_inet_sctp, OID_AUTO, udp_tunneling_port, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, sctp_sysctl_handle_udp_tunneling, "IU", SCTPCTL_UDP_TUNNELING_PORT_DESC); SCTP_UINT_SYSCTL(enable_sack_immediately, sctp_enable_sack_immediately, SCTPCTL_SACK_IMMEDIATELY_ENABLE) SCTP_UINT_SYSCTL(nat_friendly_init, sctp_inits_include_nat_friendly, SCTPCTL_NAT_FRIENDLY_INITS) SCTP_UINT_SYSCTL(vtag_time_wait, sctp_vtag_time_wait, SCTPCTL_TIME_WAIT) SCTP_UINT_SYSCTL(buffer_splitting, sctp_buffer_splitting, SCTPCTL_BUFFER_SPLITTING) SCTP_UINT_SYSCTL(initial_cwnd, sctp_initial_cwnd, SCTPCTL_INITIAL_CWND) SCTP_UINT_SYSCTL(rttvar_bw, sctp_rttvar_bw, SCTPCTL_RTTVAR_BW) SCTP_UINT_SYSCTL(rttvar_rtt, sctp_rttvar_rtt, SCTPCTL_RTTVAR_RTT) SCTP_UINT_SYSCTL(rttvar_eqret, sctp_rttvar_eqret, SCTPCTL_RTTVAR_EQRET) SCTP_UINT_SYSCTL(rttvar_steady_step, sctp_steady_step, SCTPCTL_RTTVAR_STEADYS) SCTP_UINT_SYSCTL(use_dcccecn, sctp_use_dccc_ecn, SCTPCTL_RTTVAR_DCCCECN) SCTP_UINT_SYSCTL(blackhole, sctp_blackhole, SCTPCTL_BLACKHOLE) SCTP_UINT_SYSCTL(diag_info_code, sctp_diag_info_code, SCTPCTL_DIAG_INFO_CODE) #ifdef SCTP_DEBUG SCTP_UINT_SYSCTL(debug, sctp_debug_on, SCTPCTL_DEBUG) #endif #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_UINT_SYSCTL(output_unlocked, sctp_output_unlocked, SCTPCTL_OUTPUT_UNLOCKED) #endif SYSCTL_PROC(_net_inet_sctp, OID_AUTO, stats, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_RW, NULL, 0, sctp_sysctl_handle_stats, "S,sctpstat", "SCTP statistics (struct sctp_stat)"); SYSCTL_PROC(_net_inet_sctp, OID_AUTO, assoclist, CTLFLAG_VNET | CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sctp_sysctl_handle_assoclist, "S,xassoc", "List of active SCTP associations"); Index: head/sys/netinet/sctp_usrreq.c =================================================================== --- head/sys/netinet/sctp_usrreq.c (revision 319721) +++ head/sys/netinet/sctp_usrreq.c (revision 319722) @@ -1,7450 +1,7443 @@ /*- * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include extern const struct sctp_cc_functions sctp_cc_functions[]; extern const struct sctp_ss_functions sctp_ss_functions[]; void sctp_init(void) { u_long sb_max_adj; /* Initialize and modify the sysctled variables */ sctp_init_sysctls(); if ((nmbclusters / 8) > SCTP_ASOC_MAX_CHUNKS_ON_QUEUE) SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue) = (nmbclusters / 8); /* * Allow a user to take no more than 1/2 the number of clusters or * the SB_MAX whichever is smaller for the send window. */ sb_max_adj = (u_long)((u_quad_t)(SB_MAX) * MCLBYTES / (MSIZE + MCLBYTES)); SCTP_BASE_SYSCTL(sctp_sendspace) = min(sb_max_adj, (((uint32_t)nmbclusters / 2) * SCTP_DEFAULT_MAXSEGMENT)); /* * Now for the recv window, should we take the same amount? or * should I do 1/2 the SB_MAX instead in the SB_MAX min above. For * now I will just copy. */ SCTP_BASE_SYSCTL(sctp_recvspace) = SCTP_BASE_SYSCTL(sctp_sendspace); SCTP_BASE_VAR(first_time) = 0; SCTP_BASE_VAR(sctp_pcb_initialized) = 0; sctp_pcb_init(); #if defined(SCTP_PACKET_LOGGING) SCTP_BASE_VAR(packet_log_writers) = 0; SCTP_BASE_VAR(packet_log_end) = 0; bzero(&SCTP_BASE_VAR(packet_log_buffer), SCTP_PACKET_LOG_SIZE); #endif } #ifdef VIMAGE static void sctp_finish(void *unused __unused) { sctp_pcb_finish(); } VNET_SYSUNINIT(sctp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, sctp_finish, NULL); #endif void sctp_pathmtu_adjustment(struct sctp_tcb *stcb, uint16_t nxtsz) { struct sctp_tmit_chunk *chk; uint16_t overhead; /* Adjust that too */ stcb->asoc.smallest_mtu = nxtsz; /* now off to subtract IP_DF flag if needed */ overhead = IP_HDR_SIZE + sizeof(struct sctphdr); if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { overhead += sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } TAILQ_FOREACH(chk, &stcb->asoc.send_queue, sctp_next) { if ((chk->send_size + overhead) > nxtsz) { chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; } } TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if ((chk->send_size + overhead) > nxtsz) { /* * For this guy we also mark for immediate resend * since we sent to big of chunk */ chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; if (chk->sent < SCTP_DATAGRAM_RESEND) { sctp_flight_size_decrease(chk); sctp_total_flight_decrease(stcb, chk); chk->sent = SCTP_DATAGRAM_RESEND; sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); chk->rec.data.doing_fast_retransmit = 0; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PMTU, chk->whoTo->flight_size, chk->book_size, (uint32_t)(uintptr_t)chk->whoTo, chk->rec.data.tsn); } /* Clear any time so NO RTT is being done */ chk->do_rtt = 0; } } } } #ifdef INET void sctp_notify(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, uint8_t icmp_type, uint8_t icmp_code, uint16_t ip_len, uint32_t next_mtu) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif int timer_stopped; if (icmp_type != ICMP_UNREACH) { /* We only care about unreachable */ SCTP_TCB_UNLOCK(stcb); return; } if ((icmp_code == ICMP_UNREACH_NET) || (icmp_code == ICMP_UNREACH_HOST) || (icmp_code == ICMP_UNREACH_NET_UNKNOWN) || (icmp_code == ICMP_UNREACH_HOST_UNKNOWN) || (icmp_code == ICMP_UNREACH_ISOLATED) || (icmp_code == ICMP_UNREACH_NET_PROHIB) || (icmp_code == ICMP_UNREACH_HOST_PROHIB) || (icmp_code == ICMP_UNREACH_FILTER_PROHIB)) { /* Mark the net unreachable. */ if (net->dest_state & SCTP_ADDR_REACHABLE) { /* OK, that destination is NOT reachable. */ net->dest_state &= ~SCTP_ADDR_REACHABLE; net->dest_state &= ~SCTP_ADDR_PF; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, (void *)net, SCTP_SO_NOT_LOCKED); } SCTP_TCB_UNLOCK(stcb); } else if ((icmp_code == ICMP_UNREACH_PROTOCOL) || (icmp_code == ICMP_UNREACH_PORT)) { /* Treat it like an ABORT. */ sctp_abort_notification(stcb, 1, 0, NULL, SCTP_SO_NOT_LOCKED); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); /* SCTP_TCB_UNLOCK(stcb); MT: I think this is not needed. */ #endif /* no need to unlock here, since the TCB is gone */ } else if (icmp_code == ICMP_UNREACH_NEEDFRAG) { /* Find the next (smaller) MTU */ if (next_mtu == 0) { /* * Old type router that does not tell us what the * next MTU is. Rats we will have to guess (in a * educated fashion of course). */ next_mtu = sctp_get_prev_mtu(ip_len); } /* Stop the PMTU timer. */ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { timer_stopped = 1; sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1); } else { timer_stopped = 0; } /* Update the path MTU. */ if (net->port) { next_mtu -= sizeof(struct udphdr); } if (net->mtu > next_mtu) { net->mtu = next_mtu; if (net->port) { sctp_hc_set_mtu(&net->ro._l_addr, inp->fibnum, next_mtu + sizeof(struct udphdr)); } else { sctp_hc_set_mtu(&net->ro._l_addr, inp->fibnum, next_mtu); } } /* Update the association MTU */ if (stcb->asoc.smallest_mtu > next_mtu) { sctp_pathmtu_adjustment(stcb, next_mtu); } /* Finally, start the PMTU timer if it was running before. */ if (timer_stopped) { sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); } SCTP_TCB_UNLOCK(stcb); } else { SCTP_TCB_UNLOCK(stcb); } } void sctp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *outer_ip; struct ip *inner_ip; struct sctphdr *sh; struct icmp *icmp; struct sctp_inpcb *inp; struct sctp_tcb *stcb; struct sctp_nets *net; struct sctp_init_chunk *ch; struct sockaddr_in src, dst; if (sa->sa_family != AF_INET || ((struct sockaddr_in *)sa)->sin_addr.s_addr == INADDR_ANY) { return; } if (PRC_IS_REDIRECT(cmd)) { vip = NULL; } else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) { return; } if (vip != NULL) { inner_ip = (struct ip *)vip; icmp = (struct icmp *)((caddr_t)inner_ip - (sizeof(struct icmp) - sizeof(struct ip))); outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip)); sh = (struct sctphdr *)((caddr_t)inner_ip + (inner_ip->ip_hl << 2)); memset(&src, 0, sizeof(struct sockaddr_in)); src.sin_family = AF_INET; src.sin_len = sizeof(struct sockaddr_in); src.sin_port = sh->src_port; src.sin_addr = inner_ip->ip_src; memset(&dst, 0, sizeof(struct sockaddr_in)); dst.sin_family = AF_INET; dst.sin_len = sizeof(struct sockaddr_in); dst.sin_port = sh->dest_port; dst.sin_addr = inner_ip->ip_dst; /* * 'dst' holds the dest of the packet that failed to be * sent. 'src' holds our local endpoint address. Thus we * reverse the dst and the src in the lookup. */ inp = NULL; net = NULL; stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst, (struct sockaddr *)&src, &inp, &net, 1, SCTP_DEFAULT_VRFID); if ((stcb != NULL) && (net != NULL) && (inp != NULL)) { /* Check the verification tag */ if (ntohl(sh->v_tag) != 0) { /* * This must be the verification tag used * for sending out packets. We don't * consider packets reflecting the * verification tag. */ if (ntohl(sh->v_tag) != stcb->asoc.peer_vtag) { SCTP_TCB_UNLOCK(stcb); return; } } else { if (ntohs(outer_ip->ip_len) >= sizeof(struct ip) + 8 + (inner_ip->ip_hl << 2) + 20) { /* * In this case we can check if we * got an INIT chunk and if the * initiate tag matches. */ ch = (struct sctp_init_chunk *)(sh + 1); if ((ch->ch.chunk_type != SCTP_INITIATION) || (ntohl(ch->init.initiate_tag) != stcb->asoc.my_vtag)) { SCTP_TCB_UNLOCK(stcb); return; } } else { SCTP_TCB_UNLOCK(stcb); return; } } sctp_notify(inp, stcb, net, icmp->icmp_type, icmp->icmp_code, ntohs(inner_ip->ip_len), (uint32_t)ntohs(icmp->icmp_nextmtu)); } else { if ((stcb == NULL) && (inp != NULL)) { /* reduce ref-count */ SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); } if (stcb) { SCTP_TCB_UNLOCK(stcb); } } } return; } #endif static int sctp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct sctp_inpcb *inp; struct sctp_nets *net; struct sctp_tcb *stcb; int error; uint32_t vrf_id; /* FIX, for non-bsd is this right? */ vrf_id = SCTP_DEFAULT_VRFID; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); stcb = sctp_findassociation_addr_sa(sintosa(&addrs[1]), sintosa(&addrs[0]), &inp, &net, 1, vrf_id); if (stcb == NULL || inp == NULL || inp->sctp_socket == NULL) { if ((inp != NULL) && (stcb == NULL)) { /* reduce ref-count */ SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); goto cred_can_cont; } SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; goto out; } SCTP_TCB_UNLOCK(stcb); /* * We use the write lock here, only since in the error leg we need * it. If we used RLOCK, then we would have to * wlock/decr/unlock/rlock. Which in theory could create a hole. * Better to use higher wlock. */ SCTP_INP_WLOCK(inp); cred_can_cont: error = cr_canseesocket(req->td->td_ucred, inp->sctp_socket); if (error) { SCTP_INP_WUNLOCK(inp); goto out; } cru2x(inp->sctp_socket->so_cred, &xuc); SCTP_INP_WUNLOCK(inp); error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); out: return (error); } SYSCTL_PROC(_net_inet_sctp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW, 0, 0, sctp_getcred, "S,ucred", "Get the ucred of a SCTP connection"); #ifdef INET static void sctp_abort(struct socket *so) { struct sctp_inpcb *inp; uint32_t flags; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { return; } sctp_must_try_again: flags = inp->sctp_flags; #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 17); #endif if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 16); #endif sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_AFTER_CMPSET_OFCLOSE); SOCK_LOCK(so); SCTP_SB_CLEAR(so->so_snd); /* * same for the rcv ones, they are only here for the * accounting/select. */ SCTP_SB_CLEAR(so->so_rcv); /* Now null out the reference, we are completely detached. */ so->so_pcb = NULL; SOCK_UNLOCK(so); } else { flags = inp->sctp_flags; if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) { goto sctp_must_try_again; } } return; } static int sctp_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUSED) { struct sctp_inpcb *inp; struct inpcb *ip_inp; int error; uint32_t vrf_id = SCTP_DEFAULT_VRFID; inp = (struct sctp_inpcb *)so->so_pcb; if (inp != NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = SCTP_SORESERVE(so, SCTP_BASE_SYSCTL(sctp_sendspace), SCTP_BASE_SYSCTL(sctp_recvspace)); if (error) { return (error); } } error = sctp_inpcb_alloc(so, vrf_id); if (error) { return (error); } inp = (struct sctp_inpcb *)so->so_pcb; SCTP_INP_WLOCK(inp); inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUND_V6; /* I'm not v6! */ ip_inp = &inp->ip_inp.inp; ip_inp->inp_vflag |= INP_IPV4; ip_inp->inp_ip_ttl = MODULE_GLOBAL(ip_defttl); SCTP_INP_WUNLOCK(inp); return (0); } static int sctp_bind(struct socket *so, struct sockaddr *addr, struct thread *p) { struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } if (addr != NULL) { if ((addr->sa_family != AF_INET) || (addr->sa_len != sizeof(struct sockaddr_in))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } } return (sctp_inpcb_bind(so, addr, NULL, p)); } #endif void sctp_close(struct socket *so) { struct sctp_inpcb *inp; uint32_t flags; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) return; /* * Inform all the lower layer assoc that we are done. */ sctp_must_try_again: flags = inp->sctp_flags; #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 17); #endif if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) { if (((so->so_options & SO_LINGER) && (so->so_linger == 0)) || (so->so_rcv.sb_cc > 0)) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 13); #endif sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_AFTER_CMPSET_OFCLOSE); } else { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 14); #endif sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE, SCTP_CALLED_AFTER_CMPSET_OFCLOSE); } /* * The socket is now detached, no matter what the state of * the SCTP association. */ SOCK_LOCK(so); SCTP_SB_CLEAR(so->so_snd); /* * same for the rcv ones, they are only here for the * accounting/select. */ SCTP_SB_CLEAR(so->so_rcv); /* Now null out the reference, we are completely detached. */ so->so_pcb = NULL; SOCK_UNLOCK(so); } else { flags = inp->sctp_flags; if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) { goto sctp_must_try_again; } } return; } int sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p); int sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p) { struct sctp_inpcb *inp; int error; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { if (control) { sctp_m_freem(control); control = NULL; } SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); sctp_m_freem(m); return (EINVAL); } /* Got to have an to address if we are NOT a connected socket */ if ((addr == NULL) && ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) || (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE))) { goto connected_type; } else if (addr == NULL) { SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ); error = EDESTADDRREQ; sctp_m_freem(m); if (control) { sctp_m_freem(control); control = NULL; } return (error); } #ifdef INET6 if (addr->sa_family != AF_INET) { /* must be a v4 address! */ SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ); sctp_m_freem(m); if (control) { sctp_m_freem(control); control = NULL; } error = EDESTADDRREQ; return (error); } #endif /* INET6 */ connected_type: /* now what about control */ if (control) { if (inp->control) { SCTP_PRINTF("huh? control set?\n"); sctp_m_freem(inp->control); inp->control = NULL; } inp->control = control; } /* Place the data */ if (inp->pkt) { SCTP_BUF_NEXT(inp->pkt_last) = m; inp->pkt_last = m; } else { inp->pkt_last = inp->pkt = m; } if ( /* FreeBSD uses a flag passed */ ((flags & PRUS_MORETOCOME) == 0) ) { /* * note with the current version this code will only be used * by OpenBSD-- NetBSD, FreeBSD, and MacOS have methods for * re-defining sosend to use the sctp_sosend. One can * optionally switch back to this code (by changing back the * definitions) but this is not advisable. This code is used * by FreeBSD when sending a file with sendfile() though. */ int ret; ret = sctp_output(inp, inp->pkt, addr, inp->control, p, flags); inp->pkt = NULL; inp->control = NULL; return (ret); } else { return (0); } } int sctp_disconnect(struct socket *so) { struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); return (ENOTCONN); } SCTP_INP_RLOCK(inp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { if (LIST_EMPTY(&inp->sctp_asoc_list)) { /* No connection */ SCTP_INP_RUNLOCK(inp); return (0); } else { struct sctp_association *asoc; struct sctp_tcb *stcb; stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } SCTP_TCB_LOCK(stcb); asoc = &stcb->asoc; if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { /* We are about to be freed, out of here */ SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); return (0); } if (((so->so_options & SO_LINGER) && (so->so_linger == 0)) || (so->so_rcv.sb_cc > 0)) { if (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) { /* Left with Data unread */ struct mbuf *err; err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA); if (err) { /* * Fill in the user * initiated abort */ struct sctp_paramhdr *ph; ph = mtod(err, struct sctp_paramhdr *); SCTP_BUF_LEN(err) = sizeof(struct sctp_paramhdr); ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); ph->param_length = htons(SCTP_BUF_LEN(err)); } sctp_send_abort_tcb(stcb, err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); } SCTP_INP_RUNLOCK(inp); if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_3); /* No unlock tcb assoc is gone */ return (0); } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->stream_queue_cnt == 0)) { /* there is nothing queued to send, so done */ if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { /* only send SHUTDOWN 1st time thru */ struct sctp_nets *netp; if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(stcb); if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; } else { netp = stcb->asoc.primary_destination; } sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_LOCKED); } } else { /* * we still got (or just got) data to send, * so set SHUTDOWN_PENDING */ /* * XXX sockets draft says that SCTP_EOF * should be sent with no data. currently, * we will allow user data to be sent first * and move to SHUTDOWN-PENDING */ struct sctp_nets *netp; if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; } else { netp = stcb->asoc.primary_destination; } asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; abort_anyway: op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_4; sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_INP_RUNLOCK(inp); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_5); return (0); } else { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); } } soisdisconnecting(so); SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); return (0); } /* not reached */ } else { /* UDP model does not support this */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); return (EOPNOTSUPP); } } int sctp_flush(struct socket *so, int how) { /* * We will just clear out the values and let subsequent close clear * out the data, if any. Note if the user did a shutdown(SHUT_RD) * they will not be able to read the data, the socket will block * that from happening. */ struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } SCTP_INP_RLOCK(inp); /* For the 1 to many model this does nothing */ if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { SCTP_INP_RUNLOCK(inp); return (0); } SCTP_INP_RUNLOCK(inp); if ((how == PRU_FLUSH_RD) || (how == PRU_FLUSH_RDWR)) { /* * First make sure the sb will be happy, we don't use these * except maybe the count */ SCTP_INP_WLOCK(inp); SCTP_INP_READ_LOCK(inp); inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_CANT_READ; SCTP_INP_READ_UNLOCK(inp); SCTP_INP_WUNLOCK(inp); so->so_rcv.sb_cc = 0; so->so_rcv.sb_mbcnt = 0; so->so_rcv.sb_mb = NULL; } if ((how == PRU_FLUSH_WR) || (how == PRU_FLUSH_RDWR)) { /* * First make sure the sb will be happy, we don't use these * except maybe the count */ so->so_snd.sb_cc = 0; so->so_snd.sb_mbcnt = 0; so->so_snd.sb_mb = NULL; } return (0); } int sctp_shutdown(struct socket *so) { struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } SCTP_INP_RLOCK(inp); /* For UDP model this is a invalid call */ if (!((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { /* Restore the flags that the soshutdown took away. */ SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_state &= ~SBS_CANTRCVMORE; SOCKBUF_UNLOCK(&so->so_rcv); /* This proc will wakeup for read and do nothing (I hope) */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); return (EOPNOTSUPP); } else { /* * Ok, if we reach here its the TCP model and it is either a * SHUT_WR or SHUT_RDWR. This means we put the shutdown flag * against it. */ struct sctp_tcb *stcb; struct sctp_association *asoc; struct sctp_nets *netp; if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { SCTP_INP_RUNLOCK(inp); return (ENOTCONN); } socantsendmore(so); stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { /* * Ok, we hit the case that the shutdown call was * made after an abort or something. Nothing to do * now. */ SCTP_INP_RUNLOCK(inp); return (0); } SCTP_TCB_LOCK(stcb); asoc = &stcb->asoc; if (asoc->state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); return (0); } if ((SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_ECHOED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_OPEN)) { /* * If we are not in or before ESTABLISHED, there is * no protocol action required. */ SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); return (0); } if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; } else { netp = stcb->asoc.primary_destination; } if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) && TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->stream_queue_cnt == 0)) { if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } /* there is nothing queued to send, so I'm done... */ SCTP_STAT_DECR_GAUGE32(sctps_currestab); SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(stcb); sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, netp); } else { /* * We still got (or just got) data to send, so set * SHUTDOWN_PENDING. */ SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_PARTIAL_MSG_LEFT); } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; abort_anyway: op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_LOCKED); SCTP_INP_RUNLOCK(inp); return (0); } } sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); /* * XXX: Why do this in the case where we have still data * queued? */ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); return (0); } } /* * copies a "user" presentable address and removes embedded scope, etc. * returns 0 on success, 1 on error */ static uint32_t sctp_fill_user_address(struct sockaddr_storage *ss, struct sockaddr *sa) { #ifdef INET6 struct sockaddr_in6 lsa6; sa = (struct sockaddr *)sctp_recover_scope((struct sockaddr_in6 *)sa, &lsa6); #endif memcpy(ss, sa, sa->sa_len); return (0); } /* * NOTE: assumes addr lock is held */ static size_t sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp, struct sctp_tcb *stcb, size_t limit, struct sockaddr_storage *sas, uint32_t vrf_id) { struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; size_t actual; int loopback_scope; #if defined(INET) int ipv4_local_scope, ipv4_addr_legal; #endif #if defined(INET6) int local_scope, site_scope, ipv6_addr_legal; #endif struct sctp_vrf *vrf; actual = 0; if (limit <= 0) return (actual); if (stcb) { /* Turn on all the appropriate scope */ loopback_scope = stcb->asoc.scope.loopback_scope; #if defined(INET) ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope; ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal; #endif #if defined(INET6) local_scope = stcb->asoc.scope.local_scope; site_scope = stcb->asoc.scope.site_scope; ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal; #endif } else { /* Use generic values for endpoints. */ loopback_scope = 1; #if defined(INET) ipv4_local_scope = 1; #endif #if defined(INET6) local_scope = 1; site_scope = 1; #endif if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { #if defined(INET6) ipv6_addr_legal = 1; #endif #if defined(INET) if (SCTP_IPV6_V6ONLY(inp)) { ipv4_addr_legal = 0; } else { ipv4_addr_legal = 1; } #endif } else { #if defined(INET6) ipv6_addr_legal = 0; #endif #if defined(INET) ipv4_addr_legal = 1; #endif } } vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { return (0); } if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if ((loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* Skip loopback if loopback_scope not set */ continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { if (stcb) { /* * For the BOUND-ALL case, the list * associated with a TCB is Always * considered a reverse list.. i.e. * it lists addresses that are NOT * part of the association. If this * is one of those we must skip it. */ if (sctp_is_addr_restricted(stcb, sctp_ifa)) { continue; } } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (ipv4_addr_legal) { struct sockaddr_in *sin; sin = &sctp_ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* * we skip * unspecifed * addresses */ continue; } if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if ((ipv4_local_scope == 0) && (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { continue; } #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { in6_sin_2_v4mapsin6(sin, (struct sockaddr_in6 *)sas); ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport; sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(struct sockaddr_in6)); actual += sizeof(struct sockaddr_in6); } else { #endif memcpy(sas, sin, sizeof(*sin)); ((struct sockaddr_in *)sas)->sin_port = inp->sctp_lport; sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(*sin)); actual += sizeof(*sin); #ifdef INET6 } #endif if (actual >= limit) { return (actual); } } else { continue; } break; #endif #ifdef INET6 case AF_INET6: if (ipv6_addr_legal) { struct sockaddr_in6 *sin6; sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * we skip * unspecifed * addresses */ continue; } if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (local_scope == 0) continue; if (sin6->sin6_scope_id == 0) { if (sa6_recoverscope(sin6) != 0) /* * * bad * link * * local * * address */ continue; } } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { continue; } memcpy(sas, sin6, sizeof(*sin6)); ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport; sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(*sin6)); actual += sizeof(*sin6); if (actual >= limit) { return (actual); } } else { continue; } break; #endif default: /* TSNH */ break; } } } } else { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (stcb) { if (sctp_is_addr_restricted(stcb, laddr->ifa)) { continue; } } if (sctp_fill_user_address(sas, &laddr->ifa->address.sa)) continue; switch (laddr->ifa->address.sa.sa_family) { #ifdef INET case AF_INET: ((struct sockaddr_in *)sas)->sin_port = inp->sctp_lport; break; #endif #ifdef INET6 case AF_INET6: ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport; break; #endif default: /* TSNH */ break; } sas = (struct sockaddr_storage *)((caddr_t)sas + laddr->ifa->address.sa.sa_len); actual += laddr->ifa->address.sa.sa_len; if (actual >= limit) { return (actual); } } } return (actual); } static size_t sctp_fill_up_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, size_t limit, struct sockaddr_storage *sas) { size_t size = 0; SCTP_IPI_ADDR_RLOCK(); /* fill up addresses for the endpoint's default vrf */ size = sctp_fill_up_addresses_vrf(inp, stcb, limit, sas, inp->def_vrf_id); SCTP_IPI_ADDR_RUNLOCK(); return (size); } /* * NOTE: assumes addr lock is held */ static int sctp_count_max_addresses_vrf(struct sctp_inpcb *inp, uint32_t vrf_id) { int cnt = 0; struct sctp_vrf *vrf = NULL; /* * In both sub-set bound an bound_all cases we return the MAXIMUM * number of addresses that you COULD get. In reality the sub-set * bound may have an exclusion list for a given TCB OR in the * bound-all case a TCB may NOT include the loopback or other * addresses as well. */ vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { return (0); } if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { /* Count them if they are the right type */ switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) cnt += sizeof(struct sockaddr_in6); else cnt += sizeof(struct sockaddr_in); #else cnt += sizeof(struct sockaddr_in); #endif break; #endif #ifdef INET6 case AF_INET6: cnt += sizeof(struct sockaddr_in6); break; #endif default: break; } } } } else { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { switch (laddr->ifa->address.sa.sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) cnt += sizeof(struct sockaddr_in6); else cnt += sizeof(struct sockaddr_in); #else cnt += sizeof(struct sockaddr_in); #endif break; #endif #ifdef INET6 case AF_INET6: cnt += sizeof(struct sockaddr_in6); break; #endif default: break; } } } return (cnt); } static int sctp_count_max_addresses(struct sctp_inpcb *inp) { int cnt = 0; SCTP_IPI_ADDR_RLOCK(); /* count addresses for the endpoint's default VRF */ cnt = sctp_count_max_addresses_vrf(inp, inp->def_vrf_id); SCTP_IPI_ADDR_RUNLOCK(); return (cnt); } static int sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, size_t optsize, void *p, int delay) { int error = 0; int creat_lock_on = 0; struct sctp_tcb *stcb = NULL; struct sockaddr *sa; unsigned int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr; uint32_t vrf_id; int bad_addresses = 0; sctp_assoc_t *a_id; SCTPDBG(SCTP_DEBUG_PCB1, "Connectx called\n"); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { /* We are already connected AND the TCP model */ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); return (EADDRINUSE); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) && (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); SCTP_INP_RUNLOCK(inp); } if (stcb) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); return (EALREADY); } SCTP_INP_INCR_REF(inp); SCTP_ASOC_CREATE_LOCK(inp); creat_lock_on = 1; if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EFAULT); error = EFAULT; goto out_now; } totaddrp = (unsigned int *)optval; totaddr = *totaddrp; sa = (struct sockaddr *)(totaddrp + 1); stcb = sctp_connectx_helper_find(inp, sa, &totaddr, &num_v4, &num_v6, &error, (unsigned int)(optsize - sizeof(int)), &bad_addresses); if ((stcb != NULL) || bad_addresses) { /* Already have or am bring up an association */ SCTP_ASOC_CREATE_UNLOCK(inp); creat_lock_on = 0; if (stcb) SCTP_TCB_UNLOCK(stcb); if (bad_addresses == 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; } goto out_now; } #ifdef INET6 if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) && (num_v6 > 0)) { error = EINVAL; goto out_now; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && (num_v4 > 0)) { struct in6pcb *inp6; inp6 = (struct in6pcb *)inp; if (SCTP_IPV6_V6ONLY(inp6)) { /* * if IPV6_V6ONLY flag, ignore connections destined * to a v4 addr or v4-mapped addr */ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_now; } } #endif /* INET6 */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == SCTP_PCB_FLAGS_UNBOUND) { /* Bind a ephemeral port */ error = sctp_inpcb_bind(so, NULL, NULL, p); if (error) { goto out_now; } } /* FIX ME: do we want to pass in a vrf on the connect call? */ vrf_id = inp->def_vrf_id; /* We are GOOD to go */ stcb = sctp_aloc_assoc(inp, sa, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, inp->sctp_ep.port, (struct thread *)p ); if (stcb == NULL) { /* Gak! no memory */ goto out_now; } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; /* Set the connected flag so we can queue data */ soisconnecting(so); } SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); /* move to second address */ switch (sa->sa_family) { #ifdef INET case AF_INET: sa = (struct sockaddr *)((caddr_t)sa + sizeof(struct sockaddr_in)); break; #endif #ifdef INET6 case AF_INET6: sa = (struct sockaddr *)((caddr_t)sa + sizeof(struct sockaddr_in6)); break; #endif default: break; } error = 0; sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error); /* Fill in the return id */ if (error) { (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7); goto out_now; } a_id = (sctp_assoc_t *)optval; *a_id = sctp_get_associd(stcb); /* initialize authentication parameters for the assoc */ sctp_initialize_auth_params(inp, stcb); if (delay) { /* doing delayed connection */ stcb->asoc.delayed_connection = 1; sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, stcb->asoc.primary_destination); } else { (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); } SCTP_TCB_UNLOCK(stcb); out_now: if (creat_lock_on) { SCTP_ASOC_CREATE_UNLOCK(inp); } SCTP_INP_DECR_REF(inp); return (error); } #define SCTP_FIND_STCB(inp, stcb, assoc_id) { \ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||\ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { \ SCTP_INP_RLOCK(inp); \ stcb = LIST_FIRST(&inp->sctp_asoc_list); \ if (stcb) { \ SCTP_TCB_LOCK(stcb); \ } \ SCTP_INP_RUNLOCK(inp); \ } else if (assoc_id > SCTP_ALL_ASSOC) { \ stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1); \ if (stcb == NULL) { \ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); \ error = ENOENT; \ break; \ } \ } else { \ stcb = NULL; \ } \ } #define SCTP_CHECK_AND_CAST(destp, srcp, type, size) {\ if (size < sizeof(type)) { \ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); \ error = EINVAL; \ break; \ } else { \ destp = (type *)srcp; \ } \ } static int sctp_getopt(struct socket *so, int optname, void *optval, size_t *optsize, void *p) { struct sctp_inpcb *inp = NULL; int error, val = 0; struct sctp_tcb *stcb = NULL; if (optval == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return EINVAL; } error = 0; switch (optname) { case SCTP_NODELAY: case SCTP_AUTOCLOSE: case SCTP_EXPLICIT_EOR: case SCTP_AUTO_ASCONF: case SCTP_DISABLE_FRAGMENTS: case SCTP_I_WANT_MAPPED_V4_ADDR: case SCTP_USE_EXT_RCVINFO: SCTP_INP_RLOCK(inp); switch (optname) { case SCTP_DISABLE_FRAGMENTS: val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT); break; case SCTP_I_WANT_MAPPED_V4_ADDR: val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4); break; case SCTP_AUTO_ASCONF: if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* only valid for bound all sockets */ val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto flags_out; } break; case SCTP_EXPLICIT_EOR: val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR); break; case SCTP_NODELAY: val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY); break; case SCTP_USE_EXT_RCVINFO: val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO); break; case SCTP_AUTOCLOSE: if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) val = TICKS_TO_SEC(inp->sctp_ep.auto_close_time); else val = 0; break; default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; } /* end switch (sopt->sopt_name) */ if (*optsize < sizeof(val)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } flags_out: SCTP_INP_RUNLOCK(inp); if (error == 0) { /* return the option value */ *(int *)optval = val; *optsize = sizeof(val); } break; case SCTP_GET_PACKET_LOG: { #ifdef SCTP_PACKET_LOGGING uint8_t *target; int ret; SCTP_CHECK_AND_CAST(target, optval, uint8_t, *optsize); ret = sctp_copy_out_packet_log(target, (int)*optsize); *optsize = ret; #else SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; #endif break; } case SCTP_REUSE_PORT: { uint32_t *value; if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) { /* Can't do this for a 1-m socket */ error = EINVAL; break; } SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); *value = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE); *optsize = sizeof(uint32_t); break; } case SCTP_PARTIAL_DELIVERY_POINT: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); *value = inp->partial_delivery_point; *optsize = sizeof(uint32_t); break; } case SCTP_FRAGMENT_INTERLEAVE: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) { if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS)) { *value = SCTP_FRAG_LEVEL_2; } else { *value = SCTP_FRAG_LEVEL_1; } } else { *value = SCTP_FRAG_LEVEL_0; } *optsize = sizeof(uint32_t); break; } case SCTP_INTERLEAVING_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.idata_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); if (inp->idata_supported) { av->assoc_value = 1; } else { av->assoc_value = 0; } SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_CMT_ON_OFF: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.sctp_cmt_on_off; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->sctp_cmt_on_off; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_PLUGGABLE_CC: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.congestion_control_module; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->sctp_ep.sctp_default_cc_module; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_CC_OPTION: { struct sctp_cc_option *cc_opt; SCTP_CHECK_AND_CAST(cc_opt, optval, struct sctp_cc_option, *optsize); SCTP_FIND_STCB(inp, stcb, cc_opt->aid_value.assoc_id); if (stcb == NULL) { error = EINVAL; } else { if (stcb->asoc.cc_functions.sctp_cwnd_socket_option == NULL) { error = ENOTSUP; } else { error = (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 0, cc_opt); *optsize = sizeof(struct sctp_cc_option); } SCTP_TCB_UNLOCK(stcb); } break; } case SCTP_PLUGGABLE_SS: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.stream_scheduling_module; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->sctp_ep.sctp_default_ss_module; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_SS_VALUE: { struct sctp_stream_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_stream_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { if ((av->stream_id >= stcb->asoc.streamoutcnt) || (stcb->asoc.ss_functions.sctp_ss_get_value(stcb, &stcb->asoc, &stcb->asoc.strmout[av->stream_id], &av->stream_value) < 0)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { *optsize = sizeof(struct sctp_stream_value); } SCTP_TCB_UNLOCK(stcb); } else { /* * Can't get stream value without * association */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_GET_ADDR_LEN: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); error = EINVAL; #ifdef INET if (av->assoc_value == AF_INET) { av->assoc_value = sizeof(struct sockaddr_in); error = 0; } #endif #ifdef INET6 if (av->assoc_value == AF_INET6) { av->assoc_value = sizeof(struct sockaddr_in6); error = 0; } #endif if (error) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); } else { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_GET_ASSOC_NUMBER: { uint32_t *value, cnt; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); SCTP_INP_RLOCK(inp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /* Can't do this for a 1-1 socket */ error = EINVAL; SCTP_INP_RUNLOCK(inp); break; } cnt = 0; LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { cnt++; } SCTP_INP_RUNLOCK(inp); *value = cnt; *optsize = sizeof(uint32_t); break; } case SCTP_GET_ASSOC_ID_LIST: { struct sctp_assoc_ids *ids; uint32_t at; size_t limit; SCTP_CHECK_AND_CAST(ids, optval, struct sctp_assoc_ids, *optsize); SCTP_INP_RLOCK(inp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /* Can't do this for a 1-1 socket */ error = EINVAL; SCTP_INP_RUNLOCK(inp); break; } at = 0; limit = (*optsize - sizeof(uint32_t)) / sizeof(sctp_assoc_t); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { if (at < limit) { ids->gaids_assoc_id[at++] = sctp_get_associd(stcb); if (at == 0) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } SCTP_INP_RUNLOCK(inp); if (error == 0) { ids->gaids_number_of_ids = at; *optsize = ((at * sizeof(sctp_assoc_t)) + sizeof(uint32_t)); } break; } case SCTP_CONTEXT: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.context; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->sctp_context; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_VRF_ID: { uint32_t *default_vrfid; SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, *optsize); *default_vrfid = inp->def_vrf_id; *optsize = sizeof(uint32_t); break; } case SCTP_GET_ASOC_VRF: { struct sctp_assoc_value *id; SCTP_CHECK_AND_CAST(id, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, id->assoc_id); if (stcb == NULL) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); } else { id->assoc_value = stcb->asoc.vrf_id; *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_GET_VRF_IDS: { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; break; } case SCTP_GET_NONCE_VALUES: { struct sctp_get_nonce_values *gnv; SCTP_CHECK_AND_CAST(gnv, optval, struct sctp_get_nonce_values, *optsize); SCTP_FIND_STCB(inp, stcb, gnv->gn_assoc_id); if (stcb) { gnv->gn_peers_tag = stcb->asoc.peer_vtag; gnv->gn_local_tag = stcb->asoc.my_vtag; SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_get_nonce_values); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } break; } case SCTP_DELAYED_SACK: { struct sctp_sack_info *sack; SCTP_CHECK_AND_CAST(sack, optval, struct sctp_sack_info, *optsize); SCTP_FIND_STCB(inp, stcb, sack->sack_assoc_id); if (stcb) { sack->sack_delay = stcb->asoc.delayed_ack; sack->sack_freq = stcb->asoc.sack_freq; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sack->sack_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); sack->sack_delay = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]); sack->sack_freq = inp->sctp_ep.sctp_sack_freq; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_sack_info); } break; } case SCTP_GET_SNDBUF_USE: { struct sctp_sockstat *ss; SCTP_CHECK_AND_CAST(ss, optval, struct sctp_sockstat, *optsize); SCTP_FIND_STCB(inp, stcb, ss->ss_assoc_id); if (stcb) { ss->ss_total_sndbuf = stcb->asoc.total_output_queue_size; ss->ss_total_recv_buf = (stcb->asoc.size_on_reasm_queue + stcb->asoc.size_on_all_streams); SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_sockstat); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } break; } case SCTP_MAX_BURST: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.max_burst; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->sctp_ep.max_burst; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_MAXSEG: { struct sctp_assoc_value *av; int ovh; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = sctp_get_frag_point(stcb, &stcb->asoc); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { ovh = SCTP_MED_OVERHEAD; } else { ovh = SCTP_MED_V4_OVERHEAD; } if (inp->sctp_frag_point >= SCTP_DEFAULT_MAXSEGMENT) av->assoc_value = 0; else av->assoc_value = inp->sctp_frag_point - ovh; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_GET_STAT_LOG: error = sctp_fill_stat_log(optval, optsize); break; case SCTP_EVENTS: { struct sctp_event_subscribe *events; SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, *optsize); memset(events, 0, sizeof(struct sctp_event_subscribe)); SCTP_INP_RLOCK(inp); if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) events->sctp_data_io_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT)) events->sctp_association_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVPADDREVNT)) events->sctp_address_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) events->sctp_send_failure_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVPEERERR)) events->sctp_peer_error_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) events->sctp_shutdown_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT)) events->sctp_partial_delivery_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) events->sctp_adaptation_layer_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTHEVNT)) events->sctp_authentication_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT)) events->sctp_sender_dry_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT)) events->sctp_stream_reset_event = 1; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(struct sctp_event_subscribe); break; } case SCTP_ADAPTATION_LAYER: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); SCTP_INP_RLOCK(inp); *value = inp->sctp_ep.adaptation_layer_indicator; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); break; } case SCTP_SET_INITIAL_DBG_SEQ: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); SCTP_INP_RLOCK(inp); *value = inp->sctp_ep.initial_sequence_debug; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); break; } case SCTP_GET_LOCAL_ADDR_SIZE: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); SCTP_INP_RLOCK(inp); *value = sctp_count_max_addresses(inp); SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); break; } case SCTP_GET_REMOTE_ADDR_SIZE: { uint32_t *value; size_t size; struct sctp_nets *net; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); /* FIXME MT: change to sctp_assoc_value? */ SCTP_FIND_STCB(inp, stcb, (sctp_assoc_t)*value); if (stcb) { size = 0; /* Count the sizes */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { size += sizeof(struct sockaddr_in6); } else { size += sizeof(struct sockaddr_in); } #else size += sizeof(struct sockaddr_in); #endif break; #endif #ifdef INET6 case AF_INET6: size += sizeof(struct sockaddr_in6); break; #endif default: break; } } SCTP_TCB_UNLOCK(stcb); *value = (uint32_t)size; *optsize = sizeof(uint32_t); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } break; } case SCTP_GET_PEER_ADDRESSES: /* * Get the address information, an array is passed in to * fill up we pack it. */ { size_t cpsz, left; struct sockaddr_storage *sas; struct sctp_nets *net; struct sctp_getaddresses *saddr; SCTP_CHECK_AND_CAST(saddr, optval, struct sctp_getaddresses, *optsize); SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id); if (stcb) { left = (*optsize) - sizeof(struct sctp_getaddresses); *optsize = sizeof(struct sctp_getaddresses); sas = (struct sockaddr_storage *)&saddr->addr[0]; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { cpsz = sizeof(struct sockaddr_in6); } else { cpsz = sizeof(struct sockaddr_in); } #else cpsz = sizeof(struct sockaddr_in); #endif break; #endif #ifdef INET6 case AF_INET6: cpsz = sizeof(struct sockaddr_in6); break; #endif default: cpsz = 0; break; } if (cpsz == 0) { break; } if (left < cpsz) { /* not enough room. */ break; } #if defined(INET) && defined(INET6) if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) && (net->ro._l_addr.sa.sa_family == AF_INET)) { /* Must map the address */ in6_sin_2_v4mapsin6(&net->ro._l_addr.sin, (struct sockaddr_in6 *)sas); } else { memcpy(sas, &net->ro._l_addr, cpsz); } #else memcpy(sas, &net->ro._l_addr, cpsz); #endif ((struct sockaddr_in *)sas)->sin_port = stcb->rport; sas = (struct sockaddr_storage *)((caddr_t)sas + cpsz); left -= cpsz; *optsize += cpsz; } SCTP_TCB_UNLOCK(stcb); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } break; } case SCTP_GET_LOCAL_ADDRESSES: { size_t limit, actual; struct sockaddr_storage *sas; struct sctp_getaddresses *saddr; SCTP_CHECK_AND_CAST(saddr, optval, struct sctp_getaddresses, *optsize); SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id); sas = (struct sockaddr_storage *)&saddr->addr[0]; limit = *optsize - sizeof(sctp_assoc_t); actual = sctp_fill_up_addresses(inp, stcb, limit, sas); if (stcb) { SCTP_TCB_UNLOCK(stcb); } *optsize = sizeof(struct sockaddr_storage) + actual; break; } case SCTP_PEER_ADDR_PARAMS: { struct sctp_paddrparams *paddrp; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, *optsize); SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id); #if defined(INET) && defined(INET6) if (paddrp->spp_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&paddrp->spp_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&paddrp->spp_address; } } else { addr = (struct sockaddr *)&paddrp->spp_address; } #else addr = (struct sockaddr *)&paddrp->spp_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } if (stcb != NULL) { /* Applies to the specific association */ paddrp->spp_flags = 0; if (net != NULL) { paddrp->spp_hbinterval = net->heart_beat_delay; paddrp->spp_pathmaxrxt = net->failure_threshold; paddrp->spp_pathmtu = net->mtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: paddrp->spp_pathmtu -= SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: paddrp->spp_pathmtu -= SCTP_MIN_V4_OVERHEAD; break; #endif default: break; } /* get flags for HB */ if (net->dest_state & SCTP_ADDR_NOHB) { paddrp->spp_flags |= SPP_HB_DISABLE; } else { paddrp->spp_flags |= SPP_HB_ENABLE; } /* get flags for PMTU */ if (net->dest_state & SCTP_ADDR_NO_PMTUD) { paddrp->spp_flags |= SPP_PMTUD_DISABLE; } else { paddrp->spp_flags |= SPP_PMTUD_ENABLE; } if (net->dscp & 0x01) { paddrp->spp_dscp = net->dscp & 0xfc; paddrp->spp_flags |= SPP_DSCP; } #ifdef INET6 if ((net->ro._l_addr.sa.sa_family == AF_INET6) && (net->flowlabel & 0x80000000)) { paddrp->spp_ipv6_flowlabel = net->flowlabel & 0x000fffff; paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; } #endif } else { /* * No destination so return default * value */ paddrp->spp_pathmaxrxt = stcb->asoc.def_net_failure; paddrp->spp_pathmtu = 0; if (stcb->asoc.default_dscp & 0x01) { paddrp->spp_dscp = stcb->asoc.default_dscp & 0xfc; paddrp->spp_flags |= SPP_DSCP; } #ifdef INET6 if (stcb->asoc.default_flowlabel & 0x80000000) { paddrp->spp_ipv6_flowlabel = stcb->asoc.default_flowlabel & 0x000fffff; paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; } #endif /* default settings should be these */ if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { paddrp->spp_flags |= SPP_HB_DISABLE; } else { paddrp->spp_flags |= SPP_HB_ENABLE; } if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) { paddrp->spp_flags |= SPP_PMTUD_DISABLE; } else { paddrp->spp_flags |= SPP_PMTUD_ENABLE; } paddrp->spp_hbinterval = stcb->asoc.heart_beat_delay; } paddrp->spp_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (paddrp->spp_assoc_id == SCTP_FUTURE_ASSOC)) { /* Use endpoint defaults */ SCTP_INP_RLOCK(inp); paddrp->spp_pathmaxrxt = inp->sctp_ep.def_net_failure; paddrp->spp_hbinterval = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); paddrp->spp_assoc_id = SCTP_FUTURE_ASSOC; /* get inp's default */ if (inp->sctp_ep.default_dscp & 0x01) { paddrp->spp_dscp = inp->sctp_ep.default_dscp & 0xfc; paddrp->spp_flags |= SPP_DSCP; } #ifdef INET6 if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && (inp->sctp_ep.default_flowlabel & 0x80000000)) { paddrp->spp_ipv6_flowlabel = inp->sctp_ep.default_flowlabel & 0x000fffff; paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; } #endif /* can't return this */ paddrp->spp_pathmtu = 0; if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { paddrp->spp_flags |= SPP_HB_ENABLE; } else { paddrp->spp_flags |= SPP_HB_DISABLE; } if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) { paddrp->spp_flags |= SPP_PMTUD_ENABLE; } else { paddrp->spp_flags |= SPP_PMTUD_DISABLE; } SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_paddrparams); } break; } case SCTP_GET_PEER_ADDR_INFO: { struct sctp_paddrinfo *paddri; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(paddri, optval, struct sctp_paddrinfo, *optsize); SCTP_FIND_STCB(inp, stcb, paddri->spinfo_assoc_id); #if defined(INET) && defined(INET6) if (paddri->spinfo_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&paddri->spinfo_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&paddri->spinfo_address; } } else { addr = (struct sockaddr *)&paddri->spinfo_address; } #else addr = (struct sockaddr *)&paddri->spinfo_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net != NULL)) { if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* It's unconfirmed */ paddri->spinfo_state = SCTP_UNCONFIRMED; } else if (net->dest_state & SCTP_ADDR_REACHABLE) { /* It's active */ paddri->spinfo_state = SCTP_ACTIVE; } else { /* It's inactive */ paddri->spinfo_state = SCTP_INACTIVE; } paddri->spinfo_cwnd = net->cwnd; paddri->spinfo_srtt = net->lastsa >> SCTP_RTT_SHIFT; paddri->spinfo_rto = net->RTO; paddri->spinfo_assoc_id = sctp_get_associd(stcb); paddri->spinfo_mtu = net->mtu; switch (addr->sa_family) { #if defined(INET) case AF_INET: paddri->spinfo_mtu -= SCTP_MIN_V4_OVERHEAD; break; #endif #if defined(INET6) case AF_INET6: paddri->spinfo_mtu -= SCTP_MIN_OVERHEAD; break; #endif default: break; } SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_paddrinfo); } else { if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } break; } case SCTP_PCB_STATUS: { struct sctp_pcbinfo *spcb; SCTP_CHECK_AND_CAST(spcb, optval, struct sctp_pcbinfo, *optsize); sctp_fill_pcbinfo(spcb); *optsize = sizeof(struct sctp_pcbinfo); break; } case SCTP_STATUS: { struct sctp_nets *net; struct sctp_status *sstat; SCTP_CHECK_AND_CAST(sstat, optval, struct sctp_status, *optsize); SCTP_FIND_STCB(inp, stcb, sstat->sstat_assoc_id); if (stcb == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } sstat->sstat_state = sctp_map_assoc_state(stcb->asoc.state); sstat->sstat_assoc_id = sctp_get_associd(stcb); sstat->sstat_rwnd = stcb->asoc.peers_rwnd; sstat->sstat_unackdata = stcb->asoc.sent_queue_cnt; /* * We can't include chunks that have been passed to * the socket layer. Only things in queue. */ sstat->sstat_penddata = (stcb->asoc.cnt_on_reasm_queue + stcb->asoc.cnt_on_all_streams); sstat->sstat_instrms = stcb->asoc.streamincnt; sstat->sstat_outstrms = stcb->asoc.streamoutcnt; sstat->sstat_fragmentation_point = sctp_get_frag_point(stcb, &stcb->asoc); memcpy(&sstat->sstat_primary.spinfo_address, &stcb->asoc.primary_destination->ro._l_addr, ((struct sockaddr *)(&stcb->asoc.primary_destination->ro._l_addr))->sa_len); net = stcb->asoc.primary_destination; ((struct sockaddr_in *)&sstat->sstat_primary.spinfo_address)->sin_port = stcb->rport; /* * Again the user can get info from sctp_constants.h * for what the state of the network is. */ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* It's unconfirmed */ sstat->sstat_primary.spinfo_state = SCTP_UNCONFIRMED; } else if (net->dest_state & SCTP_ADDR_REACHABLE) { /* It's active */ sstat->sstat_primary.spinfo_state = SCTP_ACTIVE; } else { /* It's inactive */ sstat->sstat_primary.spinfo_state = SCTP_INACTIVE; } sstat->sstat_primary.spinfo_cwnd = net->cwnd; sstat->sstat_primary.spinfo_srtt = net->lastsa >> SCTP_RTT_SHIFT; sstat->sstat_primary.spinfo_rto = net->RTO; sstat->sstat_primary.spinfo_mtu = net->mtu; switch (stcb->asoc.primary_destination->ro._l_addr.sa.sa_family) { #if defined(INET) case AF_INET: sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_V4_OVERHEAD; break; #endif #if defined(INET6) case AF_INET6: sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_OVERHEAD; break; #endif default: break; } sstat->sstat_primary.spinfo_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_status); break; } case SCTP_RTOINFO: { struct sctp_rtoinfo *srto; SCTP_CHECK_AND_CAST(srto, optval, struct sctp_rtoinfo, *optsize); SCTP_FIND_STCB(inp, stcb, srto->srto_assoc_id); if (stcb) { srto->srto_initial = stcb->asoc.initial_rto; srto->srto_max = stcb->asoc.maxrto; srto->srto_min = stcb->asoc.minrto; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (srto->srto_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); srto->srto_initial = inp->sctp_ep.initial_rto; srto->srto_max = inp->sctp_ep.sctp_maxrto; srto->srto_min = inp->sctp_ep.sctp_minrto; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_rtoinfo); } break; } case SCTP_TIMEOUTS: { struct sctp_timeouts *stimo; SCTP_CHECK_AND_CAST(stimo, optval, struct sctp_timeouts, *optsize); SCTP_FIND_STCB(inp, stcb, stimo->stimo_assoc_id); if (stcb) { stimo->stimo_init = stcb->asoc.timoinit; stimo->stimo_data = stcb->asoc.timodata; stimo->stimo_sack = stcb->asoc.timosack; stimo->stimo_shutdown = stcb->asoc.timoshutdown; stimo->stimo_heartbeat = stcb->asoc.timoheartbeat; stimo->stimo_cookie = stcb->asoc.timocookie; stimo->stimo_shutdownack = stcb->asoc.timoshutdownack; SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_timeouts); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_ASSOCINFO: { struct sctp_assocparams *sasoc; SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, *optsize); SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id); if (stcb) { sasoc->sasoc_cookie_life = TICKS_TO_MSEC(stcb->asoc.cookie_life); sasoc->sasoc_asocmaxrxt = stcb->asoc.max_send_times; sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets; sasoc->sasoc_peer_rwnd = stcb->asoc.peers_rwnd; sasoc->sasoc_local_rwnd = stcb->asoc.my_rwnd; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sasoc->sasoc_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); sasoc->sasoc_cookie_life = TICKS_TO_MSEC(inp->sctp_ep.def_cookie_life); sasoc->sasoc_asocmaxrxt = inp->sctp_ep.max_send_times; sasoc->sasoc_number_peer_destinations = 0; sasoc->sasoc_peer_rwnd = 0; sasoc->sasoc_local_rwnd = sbspace(&inp->sctp_socket->so_rcv); SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assocparams); } break; } case SCTP_DEFAULT_SEND_PARAM: { struct sctp_sndrcvinfo *s_info; SCTP_CHECK_AND_CAST(s_info, optval, struct sctp_sndrcvinfo, *optsize); SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id); if (stcb) { memcpy(s_info, &stcb->asoc.def_send, sizeof(stcb->asoc.def_send)); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (s_info->sinfo_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); memcpy(s_info, &inp->def_send, sizeof(inp->def_send)); SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_sndrcvinfo); } break; } case SCTP_INITMSG: { struct sctp_initmsg *sinit; SCTP_CHECK_AND_CAST(sinit, optval, struct sctp_initmsg, *optsize); SCTP_INP_RLOCK(inp); sinit->sinit_num_ostreams = inp->sctp_ep.pre_open_stream_count; sinit->sinit_max_instreams = inp->sctp_ep.max_open_streams_intome; sinit->sinit_max_attempts = inp->sctp_ep.max_init_times; sinit->sinit_max_init_timeo = inp->sctp_ep.initial_init_rto_max; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(struct sctp_initmsg); break; } case SCTP_PRIMARY_ADDR: /* we allow a "get" operation on this */ { struct sctp_setprim *ssp; SCTP_CHECK_AND_CAST(ssp, optval, struct sctp_setprim, *optsize); SCTP_FIND_STCB(inp, stcb, ssp->ssp_assoc_id); if (stcb) { union sctp_sockstore *addr; addr = &stcb->asoc.primary_destination->ro._l_addr; switch (addr->sa.sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { in6_sin_2_v4mapsin6(&addr->sin, (struct sockaddr_in6 *)&ssp->ssp_addr); } else { memcpy(&ssp->ssp_addr, &addr->sin, sizeof(struct sockaddr_in)); } #else memcpy(&ssp->ssp_addr, &addr->sin, sizeof(struct sockaddr_in)); #endif break; #endif #ifdef INET6 case AF_INET6: memcpy(&ssp->ssp_addr, &addr->sin6, sizeof(struct sockaddr_in6)); break; #endif default: break; } SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_setprim); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_HMAC_IDENT: { struct sctp_hmacalgo *shmac; sctp_hmaclist_t *hmaclist; uint32_t size; int i; SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, *optsize); SCTP_INP_RLOCK(inp); hmaclist = inp->sctp_ep.local_hmacs; if (hmaclist == NULL) { /* no HMACs to return */ *optsize = sizeof(*shmac); SCTP_INP_RUNLOCK(inp); break; } /* is there room for all of the hmac ids? */ size = sizeof(*shmac) + (hmaclist->num_algo * sizeof(shmac->shmac_idents[0])); if ((size_t)(*optsize) < size) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_INP_RUNLOCK(inp); break; } /* copy in the list */ shmac->shmac_number_of_idents = hmaclist->num_algo; for (i = 0; i < hmaclist->num_algo; i++) { shmac->shmac_idents[i] = hmaclist->hmac[i]; } SCTP_INP_RUNLOCK(inp); *optsize = size; break; } case SCTP_AUTH_ACTIVE_KEY: { struct sctp_authkeyid *scact; SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, *optsize); SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id); if (stcb) { /* get the active key on the assoc */ scact->scact_keynumber = stcb->asoc.authinfo.active_keyid; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (scact->scact_assoc_id == SCTP_FUTURE_ASSOC)) { /* get the endpoint active key */ SCTP_INP_RLOCK(inp); scact->scact_keynumber = inp->sctp_ep.default_keyid; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_authkeyid); } break; } case SCTP_LOCAL_AUTH_CHUNKS: { struct sctp_authchunks *sac; sctp_auth_chklist_t *chklist = NULL; size_t size = 0; SCTP_CHECK_AND_CAST(sac, optval, struct sctp_authchunks, *optsize); SCTP_FIND_STCB(inp, stcb, sac->gauth_assoc_id); if (stcb) { /* get off the assoc */ chklist = stcb->asoc.local_auth_chunks; /* is there enough space? */ size = sctp_auth_get_chklist_size(chklist); if (*optsize < (sizeof(struct sctp_authchunks) + size)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); } else { /* copy in the chunks */ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); sac->gauth_number_of_chunks = (uint32_t)size; *optsize = sizeof(struct sctp_authchunks) + size; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sac->gauth_assoc_id == SCTP_FUTURE_ASSOC)) { /* get off the endpoint */ SCTP_INP_RLOCK(inp); chklist = inp->sctp_ep.local_auth_chunks; /* is there enough space? */ size = sctp_auth_get_chklist_size(chklist); if (*optsize < (sizeof(struct sctp_authchunks) + size)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); } else { /* copy in the chunks */ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); sac->gauth_number_of_chunks = (uint32_t)size; *optsize = sizeof(struct sctp_authchunks) + size; } SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_PEER_AUTH_CHUNKS: { struct sctp_authchunks *sac; sctp_auth_chklist_t *chklist = NULL; size_t size = 0; SCTP_CHECK_AND_CAST(sac, optval, struct sctp_authchunks, *optsize); SCTP_FIND_STCB(inp, stcb, sac->gauth_assoc_id); if (stcb) { /* get off the assoc */ chklist = stcb->asoc.peer_auth_chunks; /* is there enough space? */ size = sctp_auth_get_chklist_size(chklist); if (*optsize < (sizeof(struct sctp_authchunks) + size)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); } else { /* copy in the chunks */ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); sac->gauth_number_of_chunks = (uint32_t)size; *optsize = sizeof(struct sctp_authchunks) + size; } SCTP_TCB_UNLOCK(stcb); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } break; } case SCTP_EVENT: { struct sctp_event *event; uint32_t event_type; SCTP_CHECK_AND_CAST(event, optval, struct sctp_event, *optsize); SCTP_FIND_STCB(inp, stcb, event->se_assoc_id); switch (event->se_type) { case SCTP_ASSOC_CHANGE: event_type = SCTP_PCB_FLAGS_RECVASSOCEVNT; break; case SCTP_PEER_ADDR_CHANGE: event_type = SCTP_PCB_FLAGS_RECVPADDREVNT; break; case SCTP_REMOTE_ERROR: event_type = SCTP_PCB_FLAGS_RECVPEERERR; break; case SCTP_SEND_FAILED: event_type = SCTP_PCB_FLAGS_RECVSENDFAILEVNT; break; case SCTP_SHUTDOWN_EVENT: event_type = SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT; break; case SCTP_ADAPTATION_INDICATION: event_type = SCTP_PCB_FLAGS_ADAPTATIONEVNT; break; case SCTP_PARTIAL_DELIVERY_EVENT: event_type = SCTP_PCB_FLAGS_PDAPIEVNT; break; case SCTP_AUTHENTICATION_EVENT: event_type = SCTP_PCB_FLAGS_AUTHEVNT; break; case SCTP_STREAM_RESET_EVENT: event_type = SCTP_PCB_FLAGS_STREAM_RESETEVNT; break; case SCTP_SENDER_DRY_EVENT: event_type = SCTP_PCB_FLAGS_DRYEVNT; break; case SCTP_NOTIFICATIONS_STOPPED_EVENT: event_type = 0; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); error = ENOTSUP; break; case SCTP_ASSOC_RESET_EVENT: event_type = SCTP_PCB_FLAGS_ASSOC_RESETEVNT; break; case SCTP_STREAM_CHANGE_EVENT: event_type = SCTP_PCB_FLAGS_STREAM_CHANGEEVNT; break; case SCTP_SEND_FAILED_EVENT: event_type = SCTP_PCB_FLAGS_RECVNSENDFAILEVNT; break; default: event_type = 0; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (event_type > 0) { if (stcb) { event->se_on = sctp_stcb_is_feature_on(inp, stcb, event_type); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (event->se_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); event->se_on = sctp_is_feature_on(inp, event_type); SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } } if (error == 0) { *optsize = sizeof(struct sctp_event); } break; } case SCTP_RECVRCVINFO: { int onoff; if (*optsize < sizeof(int)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { SCTP_INP_RLOCK(inp); onoff = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO); SCTP_INP_RUNLOCK(inp); } if (error == 0) { /* return the option value */ *(int *)optval = onoff; *optsize = sizeof(int); } break; } case SCTP_RECVNXTINFO: { int onoff; if (*optsize < sizeof(int)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { SCTP_INP_RLOCK(inp); onoff = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO); SCTP_INP_RUNLOCK(inp); } if (error == 0) { /* return the option value */ *(int *)optval = onoff; *optsize = sizeof(int); } break; } case SCTP_DEFAULT_SNDINFO: { struct sctp_sndinfo *info; SCTP_CHECK_AND_CAST(info, optval, struct sctp_sndinfo, *optsize); SCTP_FIND_STCB(inp, stcb, info->snd_assoc_id); if (stcb) { info->snd_sid = stcb->asoc.def_send.sinfo_stream; info->snd_flags = stcb->asoc.def_send.sinfo_flags; info->snd_flags &= 0xfff0; info->snd_ppid = stcb->asoc.def_send.sinfo_ppid; info->snd_context = stcb->asoc.def_send.sinfo_context; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (info->snd_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); info->snd_sid = inp->def_send.sinfo_stream; info->snd_flags = inp->def_send.sinfo_flags; info->snd_flags &= 0xfff0; info->snd_ppid = inp->def_send.sinfo_ppid; info->snd_context = inp->def_send.sinfo_context; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_sndinfo); } break; } case SCTP_DEFAULT_PRINFO: { struct sctp_default_prinfo *info; SCTP_CHECK_AND_CAST(info, optval, struct sctp_default_prinfo, *optsize); SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id); if (stcb) { info->pr_policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags); info->pr_value = stcb->asoc.def_send.sinfo_timetolive; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (info->pr_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); info->pr_policy = PR_SCTP_POLICY(inp->def_send.sinfo_flags); info->pr_value = inp->def_send.sinfo_timetolive; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_default_prinfo); } break; } case SCTP_PEER_ADDR_THLDS: { struct sctp_paddrthlds *thlds; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(thlds, optval, struct sctp_paddrthlds, *optsize); SCTP_FIND_STCB(inp, stcb, thlds->spt_assoc_id); #if defined(INET) && defined(INET6) if (thlds->spt_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&thlds->spt_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&thlds->spt_address; } } else { addr = (struct sockaddr *)&thlds->spt_address; } #else addr = (struct sockaddr *)&thlds->spt_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } if (stcb != NULL) { if (net != NULL) { thlds->spt_pathmaxrxt = net->failure_threshold; thlds->spt_pathpfthld = net->pf_threshold; thlds->spt_pathcpthld = 0xffff; } else { thlds->spt_pathmaxrxt = stcb->asoc.def_net_failure; thlds->spt_pathpfthld = stcb->asoc.def_net_pf_threshold; thlds->spt_pathcpthld = 0xffff; } thlds->spt_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (thlds->spt_assoc_id == SCTP_FUTURE_ASSOC)) { /* Use endpoint defaults */ SCTP_INP_RLOCK(inp); thlds->spt_pathmaxrxt = inp->sctp_ep.def_net_failure; thlds->spt_pathpfthld = inp->sctp_ep.def_net_pf_threshold; thlds->spt_pathcpthld = 0xffff; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_paddrthlds); } break; } case SCTP_REMOTE_UDP_ENCAPS_PORT: { struct sctp_udpencaps *encaps; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(encaps, optval, struct sctp_udpencaps, *optsize); SCTP_FIND_STCB(inp, stcb, encaps->sue_assoc_id); #if defined(INET) && defined(INET6) if (encaps->sue_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&encaps->sue_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&encaps->sue_address; } } else { addr = (struct sockaddr *)&encaps->sue_address; } #else addr = (struct sockaddr *)&encaps->sue_address; #endif if (stcb) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } if (stcb != NULL) { if (net) { encaps->sue_port = net->port; } else { encaps->sue_port = stcb->asoc.port; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (encaps->sue_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); encaps->sue_port = inp->sctp_ep.port; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_udpencaps); } break; } case SCTP_ECN_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.ecn_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->ecn_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_PR_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.prsctp_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->prsctp_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_AUTH_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.auth_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->auth_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_ASCONF_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.asconf_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->asconf_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_RECONFIG_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.reconfig_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->reconfig_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_NRSACK_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.nrsack_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->nrsack_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_PKTDROP_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.pktdrop_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->pktdrop_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_ENABLE_STREAM_RESET: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = (uint32_t)stcb->asoc.local_strreset_support; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = (uint32_t)inp->local_strreset_support; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_PR_STREAM_STATUS: { struct sctp_prstatus *sprstat; uint16_t sid; uint16_t policy; SCTP_CHECK_AND_CAST(sprstat, optval, struct sctp_prstatus, *optsize); SCTP_FIND_STCB(inp, stcb, sprstat->sprstat_assoc_id); sid = sprstat->sprstat_sid; policy = sprstat->sprstat_policy; #if defined(SCTP_DETAILED_STR_STATS) if ((stcb != NULL) && (sid < stcb->asoc.streamoutcnt) && (policy != SCTP_PR_SCTP_NONE) && ((policy <= SCTP_PR_SCTP_MAX) || (policy == SCTP_PR_SCTP_ALL))) { if (policy == SCTP_PR_SCTP_ALL) { sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[0]; sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[0]; } else { sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[policy]; sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[policy]; } #else if ((stcb != NULL) && (sid < stcb->asoc.streamoutcnt) && (policy == SCTP_PR_SCTP_ALL)) { sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[0]; sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[0]; #endif SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_prstatus); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_PR_ASSOC_STATUS: { struct sctp_prstatus *sprstat; uint16_t policy; SCTP_CHECK_AND_CAST(sprstat, optval, struct sctp_prstatus, *optsize); SCTP_FIND_STCB(inp, stcb, sprstat->sprstat_assoc_id); policy = sprstat->sprstat_policy; if ((stcb != NULL) && (policy != SCTP_PR_SCTP_NONE) && ((policy <= SCTP_PR_SCTP_MAX) || (policy == SCTP_PR_SCTP_ALL))) { if (policy == SCTP_PR_SCTP_ALL) { sprstat->sprstat_abandoned_unsent = stcb->asoc.abandoned_unsent[0]; sprstat->sprstat_abandoned_sent = stcb->asoc.abandoned_sent[0]; } else { sprstat->sprstat_abandoned_unsent = stcb->asoc.abandoned_unsent[policy]; sprstat->sprstat_abandoned_sent = stcb->asoc.abandoned_sent[policy]; } SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_prstatus); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_MAX_CWND: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.max_cwnd; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->max_cwnd; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; break; } /* end switch (sopt->sopt_name) */ if (error) { *optsize = 0; } return (error); } static int sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, void *p) { int error, set_opt; uint32_t *mopt; struct sctp_tcb *stcb = NULL; struct sctp_inpcb *inp = NULL; uint32_t vrf_id; if (optval == NULL) { SCTP_PRINTF("optval is NULL\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_PRINTF("inp is NULL?\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } vrf_id = inp->def_vrf_id; error = 0; switch (optname) { case SCTP_NODELAY: case SCTP_AUTOCLOSE: case SCTP_AUTO_ASCONF: case SCTP_EXPLICIT_EOR: case SCTP_DISABLE_FRAGMENTS: case SCTP_USE_EXT_RCVINFO: case SCTP_I_WANT_MAPPED_V4_ADDR: /* copy in the option value */ SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize); set_opt = 0; if (error) break; switch (optname) { case SCTP_DISABLE_FRAGMENTS: set_opt = SCTP_PCB_FLAGS_NO_FRAGMENT; break; case SCTP_AUTO_ASCONF: /* * NOTE: we don't really support this flag */ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* only valid for bound all sockets */ if ((SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) && (*mopt != 0)) { /* forbidden by admin */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EPERM); return (EPERM); } set_opt = SCTP_PCB_FLAGS_AUTO_ASCONF; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } break; case SCTP_EXPLICIT_EOR: set_opt = SCTP_PCB_FLAGS_EXPLICIT_EOR; break; case SCTP_USE_EXT_RCVINFO: set_opt = SCTP_PCB_FLAGS_EXT_RCVINFO; break; case SCTP_I_WANT_MAPPED_V4_ADDR: if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { set_opt = SCTP_PCB_FLAGS_NEEDS_MAPPED_V4; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } break; case SCTP_NODELAY: set_opt = SCTP_PCB_FLAGS_NODELAY; break; case SCTP_AUTOCLOSE: if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } set_opt = SCTP_PCB_FLAGS_AUTOCLOSE; /* * The value is in ticks. Note this does not effect * old associations, only new ones. */ inp->sctp_ep.auto_close_time = SEC_TO_TICKS(*mopt); break; } SCTP_INP_WLOCK(inp); if (*mopt != 0) { sctp_feature_on(inp, set_opt); } else { sctp_feature_off(inp, set_opt); } SCTP_INP_WUNLOCK(inp); break; case SCTP_REUSE_PORT: { SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize); if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) { /* Can't set it after we are bound */ error = EINVAL; break; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) { /* Can't do this for a 1-m socket */ error = EINVAL; break; } if (optval) sctp_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE); else sctp_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE); break; } case SCTP_PARTIAL_DELIVERY_POINT: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, optsize); if (*value > SCTP_SB_LIMIT_RCV(so)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } inp->partial_delivery_point = *value; break; } case SCTP_FRAGMENT_INTERLEAVE: /* not yet until we re-write sctp_recvmsg() */ { uint32_t *level; SCTP_CHECK_AND_CAST(level, optval, uint32_t, optsize); if (*level == SCTP_FRAG_LEVEL_2) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (*level == SCTP_FRAG_LEVEL_1) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (*level == SCTP_FRAG_LEVEL_0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_INTERLEAVING_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->idata_supported = 0; } else { if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) && (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS))) { inp->idata_supported = 1; } else { /* * Must have Frag * interleave and * stream interleave * on */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_CMT_ON_OFF: if (SCTP_BASE_SYSCTL(sctp_cmt_on_off)) { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); if (av->assoc_value > SCTP_CMT_MAX) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.sctp_cmt_on_off = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_cmt_on_off = av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.sctp_cmt_on_off = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; } break; case SCTP_PLUGGABLE_CC: { struct sctp_assoc_value *av; struct sctp_nets *net; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); if ((av->assoc_value != SCTP_CC_RFC2581) && (av->assoc_value != SCTP_CC_HSTCP) && (av->assoc_value != SCTP_CC_HTCP) && (av->assoc_value != SCTP_CC_RTCC)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value]; stcb->asoc.congestion_control_module = av->assoc_value; if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); } } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.sctp_default_cc_module = av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value]; stcb->asoc.congestion_control_module = av->assoc_value; if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); } } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_CC_OPTION: { struct sctp_cc_option *cc_opt; SCTP_CHECK_AND_CAST(cc_opt, optval, struct sctp_cc_option, optsize); SCTP_FIND_STCB(inp, stcb, cc_opt->aid_value.assoc_id); if (stcb == NULL) { if (cc_opt->aid_value.assoc_id == SCTP_CURRENT_ASSOC) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (stcb->asoc.cc_functions.sctp_cwnd_socket_option) { (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 1, cc_opt); } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } else { error = EINVAL; } } else { if (stcb->asoc.cc_functions.sctp_cwnd_socket_option == NULL) { error = ENOTSUP; } else { error = (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 1, cc_opt); } SCTP_TCB_UNLOCK(stcb); } break; } case SCTP_PLUGGABLE_SS: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); if ((av->assoc_value != SCTP_SS_DEFAULT) && (av->assoc_value != SCTP_SS_ROUND_ROBIN) && (av->assoc_value != SCTP_SS_ROUND_ROBIN_PACKET) && (av->assoc_value != SCTP_SS_PRIORITY) && (av->assoc_value != SCTP_SS_FAIR_BANDWITH) && (av->assoc_value != SCTP_SS_FIRST_COME)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; stcb->asoc.stream_scheduling_module = av->assoc_value; stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.sctp_default_ss_module = av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; stcb->asoc.stream_scheduling_module = av->assoc_value; stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_SS_VALUE: { struct sctp_stream_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_stream_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { if ((av->stream_id >= stcb->asoc.streamoutcnt) || (stcb->asoc.ss_functions.sctp_ss_set_value(stcb, &stcb->asoc, &stcb->asoc.strmout[av->stream_id], av->stream_value) < 0)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if (av->assoc_id == SCTP_CURRENT_ASSOC) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (av->stream_id < stcb->asoc.streamoutcnt) { stcb->asoc.ss_functions.sctp_ss_set_value(stcb, &stcb->asoc, &stcb->asoc.strmout[av->stream_id], av->stream_value); } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } else { /* * Can't set stream value without * association */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_CLR_STAT_LOG: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; break; case SCTP_CONTEXT: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.context = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_context = av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.context = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_VRF_ID: { uint32_t *default_vrfid; SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, optsize); if (*default_vrfid > SCTP_MAX_VRF_ID) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } inp->def_vrf_id = *default_vrfid; break; } case SCTP_DEL_VRF_ID: { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; break; } case SCTP_ADD_VRF_ID: { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; break; } case SCTP_DELAYED_SACK: { struct sctp_sack_info *sack; SCTP_CHECK_AND_CAST(sack, optval, struct sctp_sack_info, optsize); SCTP_FIND_STCB(inp, stcb, sack->sack_assoc_id); if (sack->sack_delay) { if (sack->sack_delay > SCTP_MAX_SACK_DELAY) sack->sack_delay = SCTP_MAX_SACK_DELAY; if (MSEC_TO_TICKS(sack->sack_delay) < 1) { sack->sack_delay = TICKS_TO_MSEC(1); } } if (stcb) { if (sack->sack_delay) { stcb->asoc.delayed_ack = sack->sack_delay; } if (sack->sack_freq) { stcb->asoc.sack_freq = sack->sack_freq; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sack->sack_assoc_id == SCTP_FUTURE_ASSOC) || (sack->sack_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); if (sack->sack_delay) { inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(sack->sack_delay); } if (sack->sack_freq) { inp->sctp_ep.sctp_sack_freq = sack->sack_freq; } SCTP_INP_WUNLOCK(inp); } if ((sack->sack_assoc_id == SCTP_CURRENT_ASSOC) || (sack->sack_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (sack->sack_delay) { stcb->asoc.delayed_ack = sack->sack_delay; } if (sack->sack_freq) { stcb->asoc.sack_freq = sack->sack_freq; } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_AUTH_CHUNK: { struct sctp_authchunk *sauth; SCTP_CHECK_AND_CAST(sauth, optval, struct sctp_authchunk, optsize); SCTP_INP_WLOCK(inp); if (sctp_auth_add_chunk(sauth->sauth_chunk, inp->sctp_ep.local_auth_chunks)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_INP_WUNLOCK(inp); break; } case SCTP_AUTH_KEY: { struct sctp_authkey *sca; struct sctp_keyhead *shared_keys; sctp_sharedkey_t *shared_key; sctp_key_t *key = NULL; size_t size; SCTP_CHECK_AND_CAST(sca, optval, struct sctp_authkey, optsize); if (sca->sca_keylength == 0) { size = optsize - sizeof(struct sctp_authkey); } else { if (sca->sca_keylength + sizeof(struct sctp_authkey) <= optsize) { size = sca->sca_keylength; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } } SCTP_FIND_STCB(inp, stcb, sca->sca_assoc_id); if (stcb) { shared_keys = &stcb->asoc.shared_keys; /* clear the cached keys for this key id */ sctp_clear_cachedkeys(stcb, sca->sca_keynumber); /* * create the new shared key and * insert/replace it */ if (size > 0) { key = sctp_set_key(sca->sca_key, (uint32_t)size); if (key == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_TCB_UNLOCK(stcb); break; } } shared_key = sctp_alloc_sharedkey(); if (shared_key == NULL) { sctp_free_key(key); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_TCB_UNLOCK(stcb); break; } shared_key->key = key; shared_key->keyid = sca->sca_keynumber; error = sctp_insert_sharedkey(shared_keys, shared_key); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sca->sca_assoc_id == SCTP_FUTURE_ASSOC) || (sca->sca_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); shared_keys = &inp->sctp_ep.shared_keys; /* * clear the cached keys on all * assocs for this key id */ sctp_clear_cachedkeys_ep(inp, sca->sca_keynumber); /* * create the new shared key and * insert/replace it */ if (size > 0) { key = sctp_set_key(sca->sca_key, (uint32_t)size); if (key == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_INP_WUNLOCK(inp); break; } } shared_key = sctp_alloc_sharedkey(); if (shared_key == NULL) { sctp_free_key(key); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_INP_WUNLOCK(inp); break; } shared_key->key = key; shared_key->keyid = sca->sca_keynumber; error = sctp_insert_sharedkey(shared_keys, shared_key); SCTP_INP_WUNLOCK(inp); } if ((sca->sca_assoc_id == SCTP_CURRENT_ASSOC) || (sca->sca_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); shared_keys = &stcb->asoc.shared_keys; /* * clear the cached keys for * this key id */ sctp_clear_cachedkeys(stcb, sca->sca_keynumber); /* * create the new shared key * and insert/replace it */ if (size > 0) { key = sctp_set_key(sca->sca_key, (uint32_t)size); if (key == NULL) { SCTP_TCB_UNLOCK(stcb); continue; } } shared_key = sctp_alloc_sharedkey(); if (shared_key == NULL) { sctp_free_key(key); SCTP_TCB_UNLOCK(stcb); continue; } shared_key->key = key; shared_key->keyid = sca->sca_keynumber; error = sctp_insert_sharedkey(shared_keys, shared_key); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_HMAC_IDENT: { struct sctp_hmacalgo *shmac; sctp_hmaclist_t *hmaclist; uint16_t hmacid; uint32_t i; SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, optsize); if ((optsize < sizeof(struct sctp_hmacalgo) + shmac->shmac_number_of_idents * sizeof(uint16_t)) || (shmac->shmac_number_of_idents > 0xffff)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } hmaclist = sctp_alloc_hmaclist((uint16_t)shmac->shmac_number_of_idents); if (hmaclist == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; break; } for (i = 0; i < shmac->shmac_number_of_idents; i++) { hmacid = shmac->shmac_idents[i]; if (sctp_auth_add_hmacid(hmaclist, hmacid)) { /* invalid HMACs were found */ ; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; sctp_free_hmaclist(hmaclist); goto sctp_set_hmac_done; } } for (i = 0; i < hmaclist->num_algo; i++) { if (hmaclist->hmac[i] == SCTP_AUTH_HMAC_ID_SHA1) { /* already in list */ break; } } if (i == hmaclist->num_algo) { /* not found in list */ sctp_free_hmaclist(hmaclist); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } /* set it on the endpoint */ SCTP_INP_WLOCK(inp); if (inp->sctp_ep.local_hmacs) sctp_free_hmaclist(inp->sctp_ep.local_hmacs); inp->sctp_ep.local_hmacs = hmaclist; SCTP_INP_WUNLOCK(inp); sctp_set_hmac_done: break; } case SCTP_AUTH_ACTIVE_KEY: { struct sctp_authkeyid *scact; SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id); /* set the active key on the right place */ if (stcb) { /* set the active key on the assoc */ if (sctp_auth_setactivekey(stcb, scact->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (scact->scact_assoc_id == SCTP_FUTURE_ASSOC) || (scact->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); if (sctp_auth_setactivekey_ep(inp, scact->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_INP_WUNLOCK(inp); } if ((scact->scact_assoc_id == SCTP_CURRENT_ASSOC) || (scact->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); sctp_auth_setactivekey(stcb, scact->scact_keynumber); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_AUTH_DELETE_KEY: { struct sctp_authkeyid *scdel; SCTP_CHECK_AND_CAST(scdel, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, scdel->scact_assoc_id); /* delete the key from the right place */ if (stcb) { if (sctp_delete_sharedkey(stcb, scdel->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (scdel->scact_assoc_id == SCTP_FUTURE_ASSOC) || (scdel->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); if (sctp_delete_sharedkey_ep(inp, scdel->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_INP_WUNLOCK(inp); } if ((scdel->scact_assoc_id == SCTP_CURRENT_ASSOC) || (scdel->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); sctp_delete_sharedkey(stcb, scdel->scact_keynumber); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_AUTH_DEACTIVATE_KEY: { struct sctp_authkeyid *keyid; SCTP_CHECK_AND_CAST(keyid, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, keyid->scact_assoc_id); /* deactivate the key from the right place */ if (stcb) { if (sctp_deact_sharedkey(stcb, keyid->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (keyid->scact_assoc_id == SCTP_FUTURE_ASSOC) || (keyid->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); if (sctp_deact_sharedkey_ep(inp, keyid->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_INP_WUNLOCK(inp); } if ((keyid->scact_assoc_id == SCTP_CURRENT_ASSOC) || (keyid->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); sctp_deact_sharedkey(stcb, keyid->scact_keynumber); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_ENABLE_STREAM_RESET: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); if (av->assoc_value & (~SCTP_ENABLE_VALUE_MASK)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.local_strreset_support = (uint8_t)av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->local_strreset_support = (uint8_t)av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.local_strreset_support = (uint8_t)av->assoc_value; SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_RESET_STREAMS: { struct sctp_reset_streams *strrst; int i, send_out = 0; int send_in = 0; SCTP_CHECK_AND_CAST(strrst, optval, struct sctp_reset_streams, optsize); SCTP_FIND_STCB(inp, stcb, strrst->srs_assoc_id); if (stcb == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; break; } if (stcb->asoc.reconfig_supported == 0) { /* * Peer does not support the chunk type. */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; SCTP_TCB_UNLOCK(stcb); break; } if (sizeof(struct sctp_reset_streams) + strrst->srs_number_streams * sizeof(uint16_t) > optsize) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); break; } if (strrst->srs_flags & SCTP_STREAM_RESET_INCOMING) { send_in = 1; if (stcb->asoc.stream_reset_outstanding) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; SCTP_TCB_UNLOCK(stcb); break; } } if (strrst->srs_flags & SCTP_STREAM_RESET_OUTGOING) { send_out = 1; } if ((strrst->srs_number_streams > SCTP_MAX_STREAMS_AT_ONCE_RESET) && send_in) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_TCB_UNLOCK(stcb); break; } if ((send_in == 0) && (send_out == 0)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); break; } for (i = 0; i < strrst->srs_number_streams; i++) { if ((send_in) && (strrst->srs_stream_list[i] > stcb->asoc.streamincnt)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if ((send_out) && (strrst->srs_stream_list[i] > stcb->asoc.streamoutcnt)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } } if (error) { SCTP_TCB_UNLOCK(stcb); break; } if (send_out) { int cnt; uint16_t strm; if (strrst->srs_number_streams) { for (i = 0, cnt = 0; i < strrst->srs_number_streams; i++) { strm = strrst->srs_stream_list[i]; if (stcb->asoc.strmout[strm].state == SCTP_STREAM_OPEN) { stcb->asoc.strmout[strm].state = SCTP_STREAM_RESET_PENDING; cnt++; } } } else { /* Its all */ for (i = 0, cnt = 0; i < stcb->asoc.streamoutcnt; i++) { if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN) { stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING; cnt++; } } } } if (send_in) { error = sctp_send_str_reset_req(stcb, strrst->srs_number_streams, strrst->srs_stream_list, send_in, 0, 0, 0, 0, 0); } else { error = sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_LOCKED); } if (error == 0) { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); } else { /* * For outgoing streams don't report any * problems in sending the request to the * application. XXX: Double check resetting * incoming streams. */ error = 0; } SCTP_TCB_UNLOCK(stcb); break; } case SCTP_ADD_STREAMS: { struct sctp_add_streams *stradd; uint8_t addstream = 0; uint16_t add_o_strmcnt = 0; uint16_t add_i_strmcnt = 0; SCTP_CHECK_AND_CAST(stradd, optval, struct sctp_add_streams, optsize); SCTP_FIND_STCB(inp, stcb, stradd->sas_assoc_id); if (stcb == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; break; } if (stcb->asoc.reconfig_supported == 0) { /* * Peer does not support the chunk type. */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; SCTP_TCB_UNLOCK(stcb); break; } if (stcb->asoc.stream_reset_outstanding) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; SCTP_TCB_UNLOCK(stcb); break; } if ((stradd->sas_outstrms == 0) && (stradd->sas_instrms == 0)) { error = EINVAL; goto skip_stuff; } if (stradd->sas_outstrms) { addstream = 1; /* We allocate here */ add_o_strmcnt = stradd->sas_outstrms; if ((((int)add_o_strmcnt) + ((int)stcb->asoc.streamoutcnt)) > 0x0000ffff) { /* You can't have more than 64k */ error = EINVAL; goto skip_stuff; } } if (stradd->sas_instrms) { int cnt; addstream |= 2; /* * We allocate inside * sctp_send_str_reset_req() */ add_i_strmcnt = stradd->sas_instrms; cnt = add_i_strmcnt; cnt += stcb->asoc.streamincnt; if (cnt > 0x0000ffff) { /* You can't have more than 64k */ error = EINVAL; goto skip_stuff; } if (cnt > (int)stcb->asoc.max_inbound_streams) { /* More than you are allowed */ error = EINVAL; goto skip_stuff; } } error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, addstream, add_o_strmcnt, add_i_strmcnt, 0); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); skip_stuff: SCTP_TCB_UNLOCK(stcb); break; } case SCTP_RESET_ASSOC: { int i; uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, optsize); SCTP_FIND_STCB(inp, stcb, (sctp_assoc_t)*value); if (stcb == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; break; } if (stcb->asoc.reconfig_supported == 0) { /* * Peer does not support the chunk type. */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; SCTP_TCB_UNLOCK(stcb); break; } if (stcb->asoc.stream_reset_outstanding) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; SCTP_TCB_UNLOCK(stcb); break; } /* * Is there any data pending in the send or sent * queues? */ if (!TAILQ_EMPTY(&stcb->asoc.send_queue) || !TAILQ_EMPTY(&stcb->asoc.sent_queue)) { busy_out: error = EBUSY; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); SCTP_TCB_UNLOCK(stcb); break; } /* Do any streams have data queued? */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { goto busy_out; } } error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 1, 0, 0, 0, 0); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); break; } case SCTP_CONNECT_X: if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } error = sctp_do_connect_x(so, inp, optval, optsize, p, 0); break; case SCTP_CONNECT_X_DELAYED: if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } error = sctp_do_connect_x(so, inp, optval, optsize, p, 1); break; case SCTP_CONNECT_X_COMPLETE: { struct sockaddr *sa; /* FIXME MT: check correct? */ SCTP_CHECK_AND_CAST(sa, optval, struct sockaddr, optsize); /* find tcb */ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); } SCTP_INP_RUNLOCK(inp); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, sa, NULL, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if (stcb == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; break; } if (stcb->asoc.delayed_connection == 1) { stcb->asoc.delayed_connection = 0; (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, stcb->asoc.primary_destination, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8); sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); } else { /* * already expired or did not use delayed * connectx */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; } SCTP_TCB_UNLOCK(stcb); break; } case SCTP_MAX_BURST: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.max_burst = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.max_burst = av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.max_burst = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_MAXSEG: { struct sctp_assoc_value *av; int ovh; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { ovh = SCTP_MED_OVERHEAD; } else { ovh = SCTP_MED_V4_OVERHEAD; } if (stcb) { if (av->assoc_value) { stcb->asoc.sctp_frag_point = (av->assoc_value + ovh); } else { stcb->asoc.sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); /* * FIXME MT: I think this is not in * tune with the API ID */ if (av->assoc_value) { inp->sctp_frag_point = (av->assoc_value + ovh); } else { inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_EVENTS: { struct sctp_event_subscribe *events; SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, optsize); SCTP_INP_WLOCK(inp); if (events->sctp_data_io_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT); } if (events->sctp_association_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT); } if (events->sctp_address_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVPADDREVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVPADDREVNT); } if (events->sctp_send_failure_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); } if (events->sctp_peer_error_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVPEERERR); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVPEERERR); } if (events->sctp_shutdown_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); } if (events->sctp_partial_delivery_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_PDAPIEVNT); } if (events->sctp_adaptation_layer_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT); } if (events->sctp_authentication_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTHEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTHEVNT); } if (events->sctp_sender_dry_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_DRYEVNT); } if (events->sctp_stream_reset_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT); } SCTP_INP_WUNLOCK(inp); SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (events->sctp_association_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT); } if (events->sctp_address_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT); } if (events->sctp_send_failure_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); } if (events->sctp_peer_error_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVPEERERR); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVPEERERR); } if (events->sctp_shutdown_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); } if (events->sctp_partial_delivery_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT); } if (events->sctp_adaptation_layer_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT); } if (events->sctp_authentication_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_AUTHEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_AUTHEVNT); } if (events->sctp_sender_dry_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DRYEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DRYEVNT); } if (events->sctp_stream_reset_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT); } SCTP_TCB_UNLOCK(stcb); } /* * Send up the sender dry event only for 1-to-1 * style sockets. */ if (events->sctp_sender_dry_event) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); } SCTP_TCB_UNLOCK(stcb); } } } SCTP_INP_RUNLOCK(inp); break; } case SCTP_ADAPTATION_LAYER: { struct sctp_setadaptation *adap_bits; SCTP_CHECK_AND_CAST(adap_bits, optval, struct sctp_setadaptation, optsize); SCTP_INP_WLOCK(inp); inp->sctp_ep.adaptation_layer_indicator = adap_bits->ssb_adaptation_ind; inp->sctp_ep.adaptation_layer_indicator_provided = 1; SCTP_INP_WUNLOCK(inp); break; } #ifdef SCTP_DEBUG case SCTP_SET_INITIAL_DBG_SEQ: { uint32_t *vvv; SCTP_CHECK_AND_CAST(vvv, optval, uint32_t, optsize); SCTP_INP_WLOCK(inp); inp->sctp_ep.initial_sequence_debug = *vvv; SCTP_INP_WUNLOCK(inp); break; } #endif case SCTP_DEFAULT_SEND_PARAM: { struct sctp_sndrcvinfo *s_info; SCTP_CHECK_AND_CAST(s_info, optval, struct sctp_sndrcvinfo, optsize); SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id); if (stcb) { if (s_info->sinfo_stream < stcb->asoc.streamoutcnt) { memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send))); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (s_info->sinfo_assoc_id == SCTP_FUTURE_ASSOC) || (s_info->sinfo_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); memcpy(&inp->def_send, s_info, min(optsize, sizeof(inp->def_send))); SCTP_INP_WUNLOCK(inp); } if ((s_info->sinfo_assoc_id == SCTP_CURRENT_ASSOC) || (s_info->sinfo_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (s_info->sinfo_stream < stcb->asoc.streamoutcnt) { memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send))); } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_PEER_ADDR_PARAMS: { struct sctp_paddrparams *paddrp; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, optsize); SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id); #if defined(INET) && defined(INET6) if (paddrp->spp_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&paddrp->spp_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&paddrp->spp_address; } } else { addr = (struct sockaddr *)&paddrp->spp_address; } #else addr = (struct sockaddr *)&paddrp->spp_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } /* sanity checks */ if ((paddrp->spp_flags & SPP_HB_ENABLE) && (paddrp->spp_flags & SPP_HB_DISABLE)) { if (stcb) SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } if ((paddrp->spp_flags & SPP_PMTUD_ENABLE) && (paddrp->spp_flags & SPP_PMTUD_DISABLE)) { if (stcb) SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } if (stcb != NULL) { /************************TCB SPECIFIC SET ******************/ if (net != NULL) { /************************NET SPECIFIC SET ******************/ if (paddrp->spp_flags & SPP_HB_DISABLE) { if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED) && !(net->dest_state & SCTP_ADDR_NOHB)) { sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_9); } net->dest_state |= SCTP_ADDR_NOHB; } if (paddrp->spp_flags & SPP_HB_ENABLE) { if (paddrp->spp_hbinterval) { net->heart_beat_delay = paddrp->spp_hbinterval; } else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { net->heart_beat_delay = 0; } sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); net->dest_state &= ~SCTP_ADDR_NOHB; } if (paddrp->spp_flags & SPP_HB_DEMAND) { /* on demand HB */ sctp_send_hb(stcb, net, SCTP_SO_LOCKED); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SOCKOPT, SCTP_SO_LOCKED); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); } if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) { if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_11); } net->dest_state |= SCTP_ADDR_NO_PMTUD; net->mtu = paddrp->spp_pathmtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: net->mtu += SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: net->mtu += SCTP_MIN_OVERHEAD; break; #endif default: break; } if (net->mtu < stcb->asoc.smallest_mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } } if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { if (!SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); } net->dest_state &= ~SCTP_ADDR_NO_PMTUD; } if (paddrp->spp_pathmaxrxt) { if (net->dest_state & SCTP_ADDR_PF) { if (net->error_count > paddrp->spp_pathmaxrxt) { net->dest_state &= ~SCTP_ADDR_PF; } } else { if ((net->error_count <= paddrp->spp_pathmaxrxt) && (net->error_count > net->pf_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_12); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } if (net->dest_state & SCTP_ADDR_REACHABLE) { if (net->error_count > paddrp->spp_pathmaxrxt) { net->dest_state &= ~SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED); } } else { if (net->error_count <= paddrp->spp_pathmaxrxt) { net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED); } } net->failure_threshold = paddrp->spp_pathmaxrxt; } if (paddrp->spp_flags & SPP_DSCP) { net->dscp = paddrp->spp_dscp & 0xfc; net->dscp |= 0x01; } #ifdef INET6 if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) { if (net->ro._l_addr.sa.sa_family == AF_INET6) { net->flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff; net->flowlabel |= 0x80000000; } } #endif } else { /************************ASSOC ONLY -- NO NET SPECIFIC SET ******************/ if (paddrp->spp_pathmaxrxt != 0) { stcb->asoc.def_net_failure = paddrp->spp_pathmaxrxt; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->dest_state & SCTP_ADDR_PF) { if (net->error_count > paddrp->spp_pathmaxrxt) { net->dest_state &= ~SCTP_ADDR_PF; } } else { if ((net->error_count <= paddrp->spp_pathmaxrxt) && (net->error_count > net->pf_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_13); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } if (net->dest_state & SCTP_ADDR_REACHABLE) { if (net->error_count > paddrp->spp_pathmaxrxt) { net->dest_state &= ~SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED); } } else { if (net->error_count <= paddrp->spp_pathmaxrxt) { net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED); } } net->failure_threshold = paddrp->spp_pathmaxrxt; } } if (paddrp->spp_flags & SPP_HB_ENABLE) { if (paddrp->spp_hbinterval != 0) { stcb->asoc.heart_beat_delay = paddrp->spp_hbinterval; } else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { stcb->asoc.heart_beat_delay = 0; } /* Turn back on the timer */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (paddrp->spp_hbinterval != 0) { net->heart_beat_delay = paddrp->spp_hbinterval; } else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { net->heart_beat_delay = 0; } if (net->dest_state & SCTP_ADDR_NOHB) { net->dest_state &= ~SCTP_ADDR_NOHB; } sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_14); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); } sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); } if (paddrp->spp_flags & SPP_HB_DISABLE) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (!(net->dest_state & SCTP_ADDR_NOHB)) { net->dest_state |= SCTP_ADDR_NOHB; if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED)) { sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_15); } } } sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); } if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_16); } net->dest_state |= SCTP_ADDR_NO_PMTUD; net->mtu = paddrp->spp_pathmtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: net->mtu += SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: net->mtu += SCTP_MIN_OVERHEAD; break; #endif default: break; } if (net->mtu < stcb->asoc.smallest_mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } } sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (!SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); } net->dest_state &= ~SCTP_ADDR_NO_PMTUD; } sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } if (paddrp->spp_flags & SPP_DSCP) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { net->dscp = paddrp->spp_dscp & 0xfc; net->dscp |= 0x01; } stcb->asoc.default_dscp = paddrp->spp_dscp & 0xfc; stcb->asoc.default_dscp |= 0x01; } #ifdef INET6 if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->ro._l_addr.sa.sa_family == AF_INET6) { net->flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff; net->flowlabel |= 0x80000000; } } stcb->asoc.default_flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff; stcb->asoc.default_flowlabel |= 0x80000000; } #endif } SCTP_TCB_UNLOCK(stcb); } else { /************************NO TCB, SET TO default stuff ******************/ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (paddrp->spp_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); /* * For the TOS/FLOWLABEL stuff you * set it with the options on the * socket */ if (paddrp->spp_pathmaxrxt != 0) { inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; } if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; else if (paddrp->spp_hbinterval != 0) { if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL) paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL; inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); } if (paddrp->spp_flags & SPP_HB_ENABLE) { if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; } else if (paddrp->spp_hbinterval) { inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); } sctp_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); } else if (paddrp->spp_flags & SPP_HB_DISABLE) { sctp_feature_on(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); } if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } else if (paddrp->spp_flags & SPP_PMTUD_DISABLE) { sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } if (paddrp->spp_flags & SPP_DSCP) { inp->sctp_ep.default_dscp = paddrp->spp_dscp & 0xfc; inp->sctp_ep.default_dscp |= 0x01; } #ifdef INET6 if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) { if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { inp->sctp_ep.default_flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff; inp->sctp_ep.default_flowlabel |= 0x80000000; } } #endif SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_RTOINFO: { struct sctp_rtoinfo *srto; uint32_t new_init, new_min, new_max; SCTP_CHECK_AND_CAST(srto, optval, struct sctp_rtoinfo, optsize); SCTP_FIND_STCB(inp, stcb, srto->srto_assoc_id); if (stcb) { if (srto->srto_initial) new_init = srto->srto_initial; else new_init = stcb->asoc.initial_rto; if (srto->srto_max) new_max = srto->srto_max; else new_max = stcb->asoc.maxrto; if (srto->srto_min) new_min = srto->srto_min; else new_min = stcb->asoc.minrto; if ((new_min <= new_init) && (new_init <= new_max)) { stcb->asoc.initial_rto = new_init; stcb->asoc.maxrto = new_max; stcb->asoc.minrto = new_min; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (srto->srto_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (srto->srto_initial) new_init = srto->srto_initial; else new_init = inp->sctp_ep.initial_rto; if (srto->srto_max) new_max = srto->srto_max; else new_max = inp->sctp_ep.sctp_maxrto; if (srto->srto_min) new_min = srto->srto_min; else new_min = inp->sctp_ep.sctp_minrto; if ((new_min <= new_init) && (new_init <= new_max)) { inp->sctp_ep.initial_rto = new_init; inp->sctp_ep.sctp_maxrto = new_max; inp->sctp_ep.sctp_minrto = new_min; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_ASSOCINFO: { struct sctp_assocparams *sasoc; SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, optsize); SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id); if (sasoc->sasoc_cookie_life) { /* boundary check the cookie life */ if (sasoc->sasoc_cookie_life < 1000) sasoc->sasoc_cookie_life = 1000; if (sasoc->sasoc_cookie_life > SCTP_MAX_COOKIE_LIFE) { sasoc->sasoc_cookie_life = SCTP_MAX_COOKIE_LIFE; } } if (stcb) { if (sasoc->sasoc_asocmaxrxt) stcb->asoc.max_send_times = sasoc->sasoc_asocmaxrxt; if (sasoc->sasoc_cookie_life) { stcb->asoc.cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sasoc->sasoc_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (sasoc->sasoc_asocmaxrxt) inp->sctp_ep.max_send_times = sasoc->sasoc_asocmaxrxt; if (sasoc->sasoc_cookie_life) { inp->sctp_ep.def_cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_INITMSG: { struct sctp_initmsg *sinit; SCTP_CHECK_AND_CAST(sinit, optval, struct sctp_initmsg, optsize); SCTP_INP_WLOCK(inp); if (sinit->sinit_num_ostreams) inp->sctp_ep.pre_open_stream_count = sinit->sinit_num_ostreams; if (sinit->sinit_max_instreams) inp->sctp_ep.max_open_streams_intome = sinit->sinit_max_instreams; if (sinit->sinit_max_attempts) inp->sctp_ep.max_init_times = sinit->sinit_max_attempts; if (sinit->sinit_max_init_timeo) inp->sctp_ep.initial_init_rto_max = sinit->sinit_max_init_timeo; SCTP_INP_WUNLOCK(inp); break; } case SCTP_PRIMARY_ADDR: { struct sctp_setprim *spa; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(spa, optval, struct sctp_setprim, optsize); SCTP_FIND_STCB(inp, stcb, spa->ssp_assoc_id); #if defined(INET) && defined(INET6) if (spa->ssp_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&spa->ssp_addr; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&spa->ssp_addr; } } else { addr = (struct sockaddr *)&spa->ssp_addr; } #else addr = (struct sockaddr *)&spa->ssp_addr; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net != NULL)) { if (net != stcb->asoc.primary_destination) { if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED)) { /* Ok we need to set it */ if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) { if ((stcb->asoc.alternate) && (!(net->dest_state & SCTP_ADDR_PF)) && (net->dest_state & SCTP_ADDR_REACHABLE)) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } break; } case SCTP_SET_DYNAMIC_PRIMARY: { union sctp_sockstore *ss; error = priv_check(curthread, PRIV_NETINET_RESERVEDPORT); if (error) break; SCTP_CHECK_AND_CAST(ss, optval, union sctp_sockstore, optsize); /* SUPER USER CHECK? */ error = sctp_dynamic_set_primary(&ss->sa, vrf_id); break; } case SCTP_SET_PEER_PRIMARY_ADDR: { struct sctp_setpeerprim *sspp; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(sspp, optval, struct sctp_setpeerprim, optsize); SCTP_FIND_STCB(inp, stcb, sspp->sspp_assoc_id); if (stcb != NULL) { struct sctp_ifa *ifa; #if defined(INET) && defined(INET6) if (sspp->sspp_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&sspp->sspp_addr; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&sspp->sspp_addr; } } else { addr = (struct sockaddr *)&sspp->sspp_addr; } #else addr = (struct sockaddr *)&sspp->sspp_addr; #endif ifa = sctp_find_ifa_by_addr(addr, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); if (ifa == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_of_it; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* * Must validate the ifa found is in * our ep */ struct sctp_laddr *laddr; int found = 0; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", __func__); continue; } if ((sctp_is_addr_restricted(stcb, laddr->ifa)) && (!sctp_is_addr_pending(stcb, laddr->ifa))) { continue; } if (laddr->ifa == ifa) { found = 1; break; } } if (!found) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_of_it; } } else { switch (addr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_of_it; } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_of_it; } break; } #endif default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_of_it; } } if (sctp_set_primary_ip_address_sa(stcb, addr) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SOCKOPT, SCTP_SO_LOCKED); out_of_it: SCTP_TCB_UNLOCK(stcb); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_BINDX_ADD_ADDR: { struct sctp_getaddresses *addrs; struct thread *td; td = (struct thread *)p; SCTP_CHECK_AND_CAST(addrs, optval, struct sctp_getaddresses, optsize); #ifdef INET if (addrs->addr->sa_family == AF_INET) { if (optsize < sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (td != NULL && (error = prison_local_ip4(td->td_ucred, &(((struct sockaddr_in *)(addrs->addr))->sin_addr)))) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif #ifdef INET6 if (addrs->addr->sa_family == AF_INET6) { if (optsize < sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (td != NULL && (error = prison_local_ip6(td->td_ucred, &(((struct sockaddr_in6 *)(addrs->addr))->sin6_addr), (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif { error = EAFNOSUPPORT; break; } sctp_bindx_add_address(so, inp, addrs->addr, addrs->sget_assoc_id, vrf_id, &error, p); break; } case SCTP_BINDX_REM_ADDR: { struct sctp_getaddresses *addrs; struct thread *td; td = (struct thread *)p; SCTP_CHECK_AND_CAST(addrs, optval, struct sctp_getaddresses, optsize); #ifdef INET if (addrs->addr->sa_family == AF_INET) { if (optsize < sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (td != NULL && (error = prison_local_ip4(td->td_ucred, &(((struct sockaddr_in *)(addrs->addr))->sin_addr)))) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif #ifdef INET6 if (addrs->addr->sa_family == AF_INET6) { if (optsize < sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (td != NULL && (error = prison_local_ip6(td->td_ucred, &(((struct sockaddr_in6 *)(addrs->addr))->sin6_addr), (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif { error = EAFNOSUPPORT; break; } sctp_bindx_delete_address(inp, addrs->addr, addrs->sget_assoc_id, vrf_id, &error); break; } case SCTP_EVENT: { struct sctp_event *event; uint32_t event_type; SCTP_CHECK_AND_CAST(event, optval, struct sctp_event, optsize); SCTP_FIND_STCB(inp, stcb, event->se_assoc_id); switch (event->se_type) { case SCTP_ASSOC_CHANGE: event_type = SCTP_PCB_FLAGS_RECVASSOCEVNT; break; case SCTP_PEER_ADDR_CHANGE: event_type = SCTP_PCB_FLAGS_RECVPADDREVNT; break; case SCTP_REMOTE_ERROR: event_type = SCTP_PCB_FLAGS_RECVPEERERR; break; case SCTP_SEND_FAILED: event_type = SCTP_PCB_FLAGS_RECVSENDFAILEVNT; break; case SCTP_SHUTDOWN_EVENT: event_type = SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT; break; case SCTP_ADAPTATION_INDICATION: event_type = SCTP_PCB_FLAGS_ADAPTATIONEVNT; break; case SCTP_PARTIAL_DELIVERY_EVENT: event_type = SCTP_PCB_FLAGS_PDAPIEVNT; break; case SCTP_AUTHENTICATION_EVENT: event_type = SCTP_PCB_FLAGS_AUTHEVNT; break; case SCTP_STREAM_RESET_EVENT: event_type = SCTP_PCB_FLAGS_STREAM_RESETEVNT; break; case SCTP_SENDER_DRY_EVENT: event_type = SCTP_PCB_FLAGS_DRYEVNT; break; case SCTP_NOTIFICATIONS_STOPPED_EVENT: event_type = 0; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); error = ENOTSUP; break; case SCTP_ASSOC_RESET_EVENT: event_type = SCTP_PCB_FLAGS_ASSOC_RESETEVNT; break; case SCTP_STREAM_CHANGE_EVENT: event_type = SCTP_PCB_FLAGS_STREAM_CHANGEEVNT; break; case SCTP_SEND_FAILED_EVENT: event_type = SCTP_PCB_FLAGS_RECVNSENDFAILEVNT; break; default: event_type = 0; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (event_type > 0) { if (stcb) { if (event->se_on) { sctp_stcb_feature_on(inp, stcb, event_type); if (event_type == SCTP_PCB_FLAGS_DRYEVNT) { if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); } } } else { sctp_stcb_feature_off(inp, stcb, event_type); } SCTP_TCB_UNLOCK(stcb); } else { /* * We don't want to send up a storm * of events, so return an error for * sender dry events */ if ((event_type == SCTP_PCB_FLAGS_DRYEVNT) && ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) == 0) && ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) == 0) && ((event->se_assoc_id == SCTP_ALL_ASSOC) || (event->se_assoc_id == SCTP_CURRENT_ASSOC))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); error = ENOTSUP; break; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (event->se_assoc_id == SCTP_FUTURE_ASSOC) || (event->se_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); if (event->se_on) { sctp_feature_on(inp, event_type); } else { sctp_feature_off(inp, event_type); } SCTP_INP_WUNLOCK(inp); } if ((event->se_assoc_id == SCTP_CURRENT_ASSOC) || (event->se_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (event->se_on) { sctp_stcb_feature_on(inp, stcb, event_type); } else { sctp_stcb_feature_off(inp, stcb, event_type); } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } } break; } case SCTP_RECVRCVINFO: { int *onoff; SCTP_CHECK_AND_CAST(onoff, optval, int, optsize); SCTP_INP_WLOCK(inp); if (*onoff != 0) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO); } SCTP_INP_WUNLOCK(inp); break; } case SCTP_RECVNXTINFO: { int *onoff; SCTP_CHECK_AND_CAST(onoff, optval, int, optsize); SCTP_INP_WLOCK(inp); if (*onoff != 0) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO); } SCTP_INP_WUNLOCK(inp); break; } case SCTP_DEFAULT_SNDINFO: { struct sctp_sndinfo *info; uint16_t policy; SCTP_CHECK_AND_CAST(info, optval, struct sctp_sndinfo, optsize); SCTP_FIND_STCB(inp, stcb, info->snd_assoc_id); if (stcb) { if (info->snd_sid < stcb->asoc.streamoutcnt) { stcb->asoc.def_send.sinfo_stream = info->snd_sid; policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags); stcb->asoc.def_send.sinfo_flags = info->snd_flags; stcb->asoc.def_send.sinfo_flags |= policy; stcb->asoc.def_send.sinfo_ppid = info->snd_ppid; stcb->asoc.def_send.sinfo_context = info->snd_context; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (info->snd_assoc_id == SCTP_FUTURE_ASSOC) || (info->snd_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->def_send.sinfo_stream = info->snd_sid; policy = PR_SCTP_POLICY(inp->def_send.sinfo_flags); inp->def_send.sinfo_flags = info->snd_flags; inp->def_send.sinfo_flags |= policy; inp->def_send.sinfo_ppid = info->snd_ppid; inp->def_send.sinfo_context = info->snd_context; SCTP_INP_WUNLOCK(inp); } if ((info->snd_assoc_id == SCTP_CURRENT_ASSOC) || (info->snd_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (info->snd_sid < stcb->asoc.streamoutcnt) { stcb->asoc.def_send.sinfo_stream = info->snd_sid; policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags); stcb->asoc.def_send.sinfo_flags = info->snd_flags; stcb->asoc.def_send.sinfo_flags |= policy; stcb->asoc.def_send.sinfo_ppid = info->snd_ppid; stcb->asoc.def_send.sinfo_context = info->snd_context; } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_DEFAULT_PRINFO: { struct sctp_default_prinfo *info; SCTP_CHECK_AND_CAST(info, optval, struct sctp_default_prinfo, optsize); SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id); if (info->pr_policy > SCTP_PR_SCTP_MAX) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (stcb) { stcb->asoc.def_send.sinfo_flags &= 0xfff0; stcb->asoc.def_send.sinfo_flags |= info->pr_policy; stcb->asoc.def_send.sinfo_timetolive = info->pr_value; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (info->pr_assoc_id == SCTP_FUTURE_ASSOC) || (info->pr_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->def_send.sinfo_flags &= 0xfff0; inp->def_send.sinfo_flags |= info->pr_policy; inp->def_send.sinfo_timetolive = info->pr_value; SCTP_INP_WUNLOCK(inp); } if ((info->pr_assoc_id == SCTP_CURRENT_ASSOC) || (info->pr_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.def_send.sinfo_flags &= 0xfff0; stcb->asoc.def_send.sinfo_flags |= info->pr_policy; stcb->asoc.def_send.sinfo_timetolive = info->pr_value; SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_PEER_ADDR_THLDS: /* Applies to the specific association */ { struct sctp_paddrthlds *thlds; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(thlds, optval, struct sctp_paddrthlds, optsize); SCTP_FIND_STCB(inp, stcb, thlds->spt_assoc_id); #if defined(INET) && defined(INET6) if (thlds->spt_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&thlds->spt_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&thlds->spt_address; } } else { addr = (struct sockaddr *)&thlds->spt_address; } #else addr = (struct sockaddr *)&thlds->spt_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } if (thlds->spt_pathcpthld != 0xffff) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } if (stcb != NULL) { if (net != NULL) { net->failure_threshold = thlds->spt_pathmaxrxt; net->pf_threshold = thlds->spt_pathpfthld; if (net->dest_state & SCTP_ADDR_PF) { if ((net->error_count > net->failure_threshold) || (net->error_count <= net->pf_threshold)) { net->dest_state &= ~SCTP_ADDR_PF; } } else { if ((net->error_count > net->pf_threshold) && (net->error_count <= net->failure_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_17); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } if (net->dest_state & SCTP_ADDR_REACHABLE) { if (net->error_count > net->failure_threshold) { net->dest_state &= ~SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED); } } else { if (net->error_count <= net->failure_threshold) { net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED); } } } else { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { net->failure_threshold = thlds->spt_pathmaxrxt; net->pf_threshold = thlds->spt_pathpfthld; if (net->dest_state & SCTP_ADDR_PF) { if ((net->error_count > net->failure_threshold) || (net->error_count <= net->pf_threshold)) { net->dest_state &= ~SCTP_ADDR_PF; } } else { if ((net->error_count > net->pf_threshold) && (net->error_count <= net->failure_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_18); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } if (net->dest_state & SCTP_ADDR_REACHABLE) { if (net->error_count > net->failure_threshold) { net->dest_state &= ~SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED); } } else { if (net->error_count <= net->failure_threshold) { net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED); } } } stcb->asoc.def_net_failure = thlds->spt_pathmaxrxt; stcb->asoc.def_net_pf_threshold = thlds->spt_pathpfthld; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (thlds->spt_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.def_net_failure = thlds->spt_pathmaxrxt; inp->sctp_ep.def_net_pf_threshold = thlds->spt_pathpfthld; SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_REMOTE_UDP_ENCAPS_PORT: { struct sctp_udpencaps *encaps; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(encaps, optval, struct sctp_udpencaps, optsize); SCTP_FIND_STCB(inp, stcb, encaps->sue_assoc_id); #if defined(INET) && defined(INET6) if (encaps->sue_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&encaps->sue_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&encaps->sue_address; } } else { addr = (struct sockaddr *)&encaps->sue_address; } #else addr = (struct sockaddr *)&encaps->sue_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } if (stcb != NULL) { if (net != NULL) { net->port = encaps->sue_port; } else { stcb->asoc.port = encaps->sue_port; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (encaps->sue_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.port = encaps->sue_port; SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_ECN_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->ecn_supported = 0; } else { inp->ecn_supported = 1; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_PR_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->prsctp_supported = 0; } else { inp->prsctp_supported = 1; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_AUTH_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { if ((av->assoc_value == 0) && (inp->asconf_supported == 1)) { /* * AUTH is required for * ASCONF */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->auth_supported = 0; } else { inp->auth_supported = 1; } SCTP_INP_WUNLOCK(inp); } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_ASCONF_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { if ((av->assoc_value != 0) && (inp->auth_supported == 0)) { /* * AUTH is required for * ASCONF */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->asconf_supported = 0; sctp_auth_delete_chunk(SCTP_ASCONF, inp->sctp_ep.local_auth_chunks); sctp_auth_delete_chunk(SCTP_ASCONF_ACK, inp->sctp_ep.local_auth_chunks); } else { inp->asconf_supported = 1; sctp_auth_add_chunk(SCTP_ASCONF, inp->sctp_ep.local_auth_chunks); sctp_auth_add_chunk(SCTP_ASCONF_ACK, inp->sctp_ep.local_auth_chunks); } SCTP_INP_WUNLOCK(inp); } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_RECONFIG_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->reconfig_supported = 0; } else { inp->reconfig_supported = 1; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_NRSACK_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->nrsack_supported = 0; } else { inp->nrsack_supported = 1; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_PKTDROP_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->pktdrop_supported = 0; } else { inp->pktdrop_supported = 1; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_MAX_CWND: { struct sctp_assoc_value *av; struct sctp_nets *net; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.max_cwnd = av->assoc_value; if (stcb->asoc.max_cwnd > 0) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if ((net->cwnd > stcb->asoc.max_cwnd) && (net->cwnd > (net->mtu - sizeof(struct sctphdr)))) { net->cwnd = stcb->asoc.max_cwnd; if (net->cwnd < (net->mtu - sizeof(struct sctphdr))) { net->cwnd = net->mtu - sizeof(struct sctphdr); } } } } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); inp->max_cwnd = av->assoc_value; SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; break; } /* end switch (opt) */ return (error); } int sctp_ctloutput(struct socket *so, struct sockopt *sopt) { void *optval = NULL; size_t optsize = 0; void *p; int error = 0; struct sctp_inpcb *inp; if ((sopt->sopt_level == SOL_SOCKET) && (sopt->sopt_name == SO_SETFIB)) { inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOBUFS); return (EINVAL); } SCTP_INP_WLOCK(inp); inp->fibnum = so->so_fibnum; SCTP_INP_WUNLOCK(inp); return (0); } if (sopt->sopt_level != IPPROTO_SCTP) { /* wrong proto level... send back up to IP */ #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) error = ip6_ctloutput(so, sopt); #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET error = ip_ctloutput(so, sopt); #endif return (error); } optsize = sopt->sopt_valsize; if (optsize) { SCTP_MALLOC(optval, void *, optsize, SCTP_M_SOCKOPT); if (optval == NULL) { SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOBUFS); return (ENOBUFS); } error = sooptcopyin(sopt, optval, optsize, optsize); if (error) { SCTP_FREE(optval, SCTP_M_SOCKOPT); goto out; } } p = (void *)sopt->sopt_td; if (sopt->sopt_dir == SOPT_SET) { error = sctp_setopt(so, sopt->sopt_name, optval, optsize, p); } else if (sopt->sopt_dir == SOPT_GET) { error = sctp_getopt(so, sopt->sopt_name, optval, &optsize, p); } else { SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } if ((error == 0) && (optval != NULL)) { error = sooptcopyout(sopt, optval, optsize); SCTP_FREE(optval, SCTP_M_SOCKOPT); } else if (optval != NULL) { SCTP_FREE(optval, SCTP_M_SOCKOPT); } out: return (error); } #ifdef INET static int sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) { int error = 0; int create_lock_on = 0; uint32_t vrf_id; struct sctp_inpcb *inp; struct sctp_tcb *stcb = NULL; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { /* I made the same as TCP since we are not setup? */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } if (addr == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return EINVAL; } switch (addr->sa_family) { #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6p; if (addr->sa_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } sin6p = (struct sockaddr_in6 *)addr; if (p != NULL && (error = prison_remote_ip6(p->td_ucred, &sin6p->sin6_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); return (error); } break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sinp; if (addr->sa_len != sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } sinp = (struct sockaddr_in *)addr; if (p != NULL && (error = prison_remote_ip4(p->td_ucred, &sinp->sin_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); return (error); } break; } #endif default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EAFNOSUPPORT); return (EAFNOSUPPORT); } SCTP_INP_INCR_REF(inp); SCTP_ASOC_CREATE_LOCK(inp); create_lock_on = 1; if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { /* Should I really unlock ? */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EFAULT); error = EFAULT; goto out_now; } #ifdef INET6 if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) && (addr->sa_family == AF_INET6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_now; } #endif if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == SCTP_PCB_FLAGS_UNBOUND) { /* Bind a ephemeral port */ error = sctp_inpcb_bind(so, NULL, NULL, p); if (error) { goto out_now; } } /* Now do we connect? */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) && (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_now; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { /* We are already connected AND the TCP model */ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); error = EADDRINUSE; goto out_now; } if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); SCTP_INP_RUNLOCK(inp); } else { /* * We increment here since sctp_findassociation_ep_addr() * will do a decrement if it finds the stcb as long as the * locked tcb (last argument) is NOT a TCB.. aka NULL. */ SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, NULL, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } else { SCTP_TCB_UNLOCK(stcb); } } if (stcb != NULL) { /* Already have or am bring up an association */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; goto out_now; } vrf_id = inp->def_vrf_id; /* We are GOOD to go */ stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, inp->sctp_ep.port, p); if (stcb == NULL) { /* Gak! no memory */ goto out_now; } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; /* Set the connected flag so we can queue data */ soisconnecting(so); } SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); /* initialize authentication parameters for the assoc */ sctp_initialize_auth_params(inp, stcb); sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); out_now: if (create_lock_on) { SCTP_ASOC_CREATE_UNLOCK(inp); } SCTP_INP_DECR_REF(inp); return (error); } #endif int sctp_listen(struct socket *so, int backlog, struct thread *p) { /* * Note this module depends on the protocol processing being called * AFTER any socket level flags and backlog are applied to the * socket. The traditional way that the socket flags are applied is * AFTER protocol processing. We have made a change to the * sys/kern/uipc_socket.c module to reverse this but this MUST be in * place if the socket API for SCTP is to work properly. */ int error = 0; struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { /* I made the same as TCP since we are not setup? */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) { /* See if we have a listener */ struct sctp_inpcb *tinp; union sctp_sockstore store; if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* not bound all */ struct sctp_laddr *laddr; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { memcpy(&store, &laddr->ifa->address, sizeof(store)); switch (store.sa.sa_family) { #ifdef INET case AF_INET: store.sin.sin_port = inp->sctp_lport; break; #endif #ifdef INET6 case AF_INET6: store.sin6.sin6_port = inp->sctp_lport; break; #endif default: break; } tinp = sctp_pcb_findep(&store.sa, 0, 0, inp->def_vrf_id); if (tinp && (tinp != inp) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && (SCTP_IS_LISTENING(tinp))) { /* * we have a listener already and * its not this inp. */ SCTP_INP_DECR_REF(tinp); return (EADDRINUSE); } else if (tinp) { SCTP_INP_DECR_REF(tinp); } } } else { /* Setup a local addr bound all */ memset(&store, 0, sizeof(store)); #ifdef INET6 if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { store.sa.sa_family = AF_INET6; store.sa.sa_len = sizeof(struct sockaddr_in6); } #endif #ifdef INET if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { store.sa.sa_family = AF_INET; store.sa.sa_len = sizeof(struct sockaddr_in); } #endif switch (store.sa.sa_family) { #ifdef INET case AF_INET: store.sin.sin_port = inp->sctp_lport; break; #endif #ifdef INET6 case AF_INET6: store.sin6.sin6_port = inp->sctp_lport; break; #endif default: break; } tinp = sctp_pcb_findep(&store.sa, 0, 0, inp->def_vrf_id); if (tinp && (tinp != inp) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && (SCTP_IS_LISTENING(tinp))) { /* * we have a listener already and its not * this inp. */ SCTP_INP_DECR_REF(tinp); return (EADDRINUSE); } else if (tinp) { SCTP_INP_DECR_REF(tinp); } } } SCTP_INP_RLOCK(inp); #ifdef SCTP_LOCK_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) { sctp_log_lock(inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_SOCK); } #endif SOCK_LOCK(so); error = solisten_proto_check(so); SOCK_UNLOCK(so); if (error) { SCTP_INP_RUNLOCK(inp); return (error); } if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /* * The unlucky case - We are in the tcp pool with this guy. * - Someone else is in the main inp slot. - We must move * this guy (the listener) to the main slot - We must then * move the guy that was listener to the TCP Pool. */ if (sctp_swap_inpcb_for_listen(inp)) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); return (EADDRINUSE); } } if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { /* We are already connected AND the TCP model */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); return (EADDRINUSE); } SCTP_INP_RUNLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { /* We must do a bind. */ if ((error = sctp_inpcb_bind(so, NULL, NULL, p))) { /* bind error, probably perm */ return (error); } } SCTP_INP_WLOCK(inp); - SOCK_LOCK(so); - /* It appears for 7.0 and on, we must always call this. */ - solisten_proto(so, backlog); - if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { - /* remove the ACCEPTCONN flag for one-to-many sockets */ - so->so_options &= ~SO_ACCEPTCONN; + if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) == 0) { + SOCK_LOCK(so); + solisten_proto(so, backlog); + SOCK_UNLOCK(so); } - if (backlog > 0) { - inp->sctp_flags |= SCTP_PCB_FLAGS_ACCEPTING; - } else { - inp->sctp_flags &= ~SCTP_PCB_FLAGS_ACCEPTING; - } - SOCK_UNLOCK(so); + inp->sctp_flags |= SCTP_PCB_FLAGS_ACCEPTING; SCTP_INP_WUNLOCK(inp); return (error); } static int sctp_defered_wakeup_cnt = 0; int sctp_accept(struct socket *so, struct sockaddr **addr) { struct sctp_tcb *stcb; struct sctp_inpcb *inp; union sctp_sockstore store; #ifdef INET6 int error; #endif inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); return (EOPNOTSUPP); } if (so->so_state & SS_ISDISCONNECTED) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ECONNABORTED); return (ECONNABORTED); } stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } SCTP_TCB_LOCK(stcb); SCTP_INP_RUNLOCK(inp); store = stcb->asoc.primary_destination->ro._l_addr; stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; SCTP_TCB_UNLOCK(stcb); switch (store.sa.sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin); if (sin == NULL) return (ENOMEM); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_port = store.sin.sin_port; sin->sin_addr = store.sin.sin_addr; *addr = (struct sockaddr *)sin; break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6); if (sin6 == NULL) return (ENOMEM); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = store.sin6.sin6_port; sin6->sin6_addr = store.sin6.sin6_addr; if ((error = sa6_recoverscope(sin6)) != 0) { SCTP_FREE_SONAME(sin6); return (error); } *addr = (struct sockaddr *)sin6; break; } #endif default: /* TSNH */ break; } /* Wake any delayed sleep action */ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { SCTP_INP_WLOCK(inp); inp->sctp_flags &= ~SCTP_PCB_FLAGS_DONT_WAKE; if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT; SCTP_INP_WUNLOCK(inp); SOCKBUF_LOCK(&inp->sctp_socket->so_snd); if (sowriteable(inp->sctp_socket)) { sowwakeup_locked(inp->sctp_socket); } else { SOCKBUF_UNLOCK(&inp->sctp_socket->so_snd); } SCTP_INP_WLOCK(inp); } if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT; SCTP_INP_WUNLOCK(inp); SOCKBUF_LOCK(&inp->sctp_socket->so_rcv); if (soreadable(inp->sctp_socket)) { sctp_defered_wakeup_cnt++; sorwakeup_locked(inp->sctp_socket); } else { SOCKBUF_UNLOCK(&inp->sctp_socket->so_rcv); } SCTP_INP_WLOCK(inp); } SCTP_INP_WUNLOCK(inp); } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_LOCK(stcb); sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_19); } return (0); } #ifdef INET int sctp_ingetaddr(struct socket *so, struct sockaddr **addr) { struct sockaddr_in *sin; uint32_t vrf_id; struct sctp_inpcb *inp; struct sctp_ifa *sctp_ifa; /* * Do the malloc first in case it blocks. */ SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin); if (sin == NULL) return (ENOMEM); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); inp = (struct sctp_inpcb *)so->so_pcb; if (!inp) { SCTP_FREE_SONAME(sin); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } SCTP_INP_RLOCK(inp); sin->sin_port = inp->sctp_lport; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { struct sctp_tcb *stcb; struct sockaddr_in *sin_a; struct sctp_nets *net; int fnd; stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { goto notConn; } fnd = 0; sin_a = NULL; SCTP_TCB_LOCK(stcb); TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sin_a = (struct sockaddr_in *)&net->ro._l_addr; if (sin_a == NULL) /* this will make coverity happy */ continue; if (sin_a->sin_family == AF_INET) { fnd = 1; break; } } if ((!fnd) || (sin_a == NULL)) { /* punt */ SCTP_TCB_UNLOCK(stcb); goto notConn; } vrf_id = inp->def_vrf_id; sctp_ifa = sctp_source_address_selection(inp, stcb, (sctp_route_t *)&net->ro, net, 0, vrf_id); if (sctp_ifa) { sin->sin_addr = sctp_ifa->address.sin.sin_addr; sctp_free_ifa(sctp_ifa); } SCTP_TCB_UNLOCK(stcb); } else { /* For the bound all case you get back 0 */ notConn: sin->sin_addr.s_addr = 0; } } else { /* Take the first IPv4 address in the list */ struct sctp_laddr *laddr; int fnd = 0; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa->address.sa.sa_family == AF_INET) { struct sockaddr_in *sin_a; sin_a = &laddr->ifa->address.sin; sin->sin_addr = sin_a->sin_addr; fnd = 1; break; } } if (!fnd) { SCTP_FREE_SONAME(sin); SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); return (ENOENT); } } SCTP_INP_RUNLOCK(inp); (*addr) = (struct sockaddr *)sin; return (0); } int sctp_peeraddr(struct socket *so, struct sockaddr **addr) { struct sockaddr_in *sin; int fnd; struct sockaddr_in *sin_a; struct sctp_inpcb *inp; struct sctp_tcb *stcb; struct sctp_nets *net; /* Do the malloc first in case it blocks. */ SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin); if (sin == NULL) return (ENOMEM); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); inp = (struct sctp_inpcb *)so->so_pcb; if ((inp == NULL) || ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) { /* UDP type and listeners will drop out here */ SCTP_FREE_SONAME(sin); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); return (ENOTCONN); } SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); } SCTP_INP_RUNLOCK(inp); if (stcb == NULL) { SCTP_FREE_SONAME(sin); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } fnd = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sin_a = (struct sockaddr_in *)&net->ro._l_addr; if (sin_a->sin_family == AF_INET) { fnd = 1; sin->sin_port = stcb->rport; sin->sin_addr = sin_a->sin_addr; break; } } SCTP_TCB_UNLOCK(stcb); if (!fnd) { /* No IPv4 address */ SCTP_FREE_SONAME(sin); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); return (ENOENT); } (*addr) = (struct sockaddr *)sin; return (0); } struct pr_usrreqs sctp_usrreqs = { .pru_abort = sctp_abort, .pru_accept = sctp_accept, .pru_attach = sctp_attach, .pru_bind = sctp_bind, .pru_connect = sctp_connect, .pru_control = in_control, .pru_close = sctp_close, .pru_detach = sctp_close, .pru_sopoll = sopoll_generic, .pru_flush = sctp_flush, .pru_disconnect = sctp_disconnect, .pru_listen = sctp_listen, .pru_peeraddr = sctp_peeraddr, .pru_send = sctp_sendm, .pru_shutdown = sctp_shutdown, .pru_sockaddr = sctp_ingetaddr, .pru_sosend = sctp_sosend, .pru_soreceive = sctp_soreceive }; #endif Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c (revision 319721) +++ head/sys/netinet/tcp_subr.c (revision 319722) @@ -1,2888 +1,2887 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif #ifdef TCP_RFC7413 #include #endif #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include #include VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); static VNET_DEFINE(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) #ifdef TCP_HHOOK VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static void tcp_mtudisc(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); static struct tcp_function_block tcp_def_funcblk = { "default", tcp_output, tcp_do_segment, tcp_default_ctloutput, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0 }; int t_functions_inited = 0; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; static void init_tcp_functions(void) { if (t_functions_inited == 0) { TAILQ_INIT(&t_functions); rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); t_functions_inited = 1; } } static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk=NULL; TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return(blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk=NULL; struct tcp_function *f; TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error=ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return(error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; bool alias; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D', "Alias", "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name); linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', alias ? f->tf_name : "-", f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; #ifdef TCP_HHOOK struct osd osd; #endif }; static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. */ static void tcp_zone_change(void *tag) { uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_tcpcb_zone, maxsockets); tcp_tw_zone_change(); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } /* * Register a TCP function block with the name provided in the names * array. (Note that this function does NOT automatically register * blk->tfb_tcp_block_name as a stack name. Therefore, you should * explicitly include blk->tfb_tcp_block_name in the list of names if * you wish to register the stack with that name.) * * Either all name registrations will succeed or all will fail. If * a name registration fails, the function will update the num_names * argument to point to the array index of the name that encountered * the failure. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names) { struct tcp_function *n; struct tcp_function_set fs; int error, i; KASSERT(names != NULL && *num_names > 0, ("%s: Called with 0-length name list", __func__)); KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); if (t_functions_inited == 0) { init_tcp_functions(); } if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* * These functions are required and you * need a name. */ *num_names = 0; return (EINVAL); } if (blk->tfb_tcp_timer_stop_all || blk->tfb_tcp_timer_activate || blk->tfb_tcp_timer_active || blk->tfb_tcp_timer_stop) { /* * If you define one timer function you * must have them all. */ if ((blk->tfb_tcp_timer_stop_all == NULL) || (blk->tfb_tcp_timer_activate == NULL) || (blk->tfb_tcp_timer_active == NULL) || (blk->tfb_tcp_timer_stop == NULL)) { *num_names = 0; return (EINVAL); } } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_flags = 0; for (i = 0; i < *num_names; i++) { n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (n == NULL) { error = ENOMEM; goto cleanup; } n->tf_fb = blk; (void)strncpy(fs.function_set_name, names[i], TCP_FUNCTION_NAME_LEN_MAX); fs.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; rw_wlock(&tcp_function_lock); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ rw_wunlock(&tcp_function_lock); free(n, M_TCPFUNCTIONS); error = EALREADY; goto cleanup; } (void)strncpy(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX); n->tf_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; TAILQ_INSERT_TAIL(&t_functions, n, tf_next); rw_wunlock(&tcp_function_lock); } return(0); cleanup: /* * Deregister the names we just added. Because registration failed * for names[i], we don't need to deregister that name. */ *num_names = i; rw_wlock(&tcp_function_lock); while (--i >= 0) { TAILQ_FOREACH(n, &t_functions, tf_next) { if (!strncmp(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX)) { TAILQ_REMOVE(&t_functions, n, tf_next); n->tf_fb = NULL; free(n, M_TCPFUNCTIONS); break; } } } rw_wunlock(&tcp_function_lock); return (error); } /* * Register a TCP function block using the name provided in the name * argument. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait) { const char *name_list[1]; int num_names, rv; num_names = 1; if (name != NULL) name_list[0] = name; else name_list[0] = blk->tfb_tcp_block_name; rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names); return (rv); } /* * Register a TCP function block using the name defined in * blk->tfb_tcp_block_name. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions(struct tcp_function_block *blk, int wait) { return (register_tcp_functions_as_name(blk, NULL, wait)); } int deregister_tcp_functions(struct tcp_function_block *blk) { struct tcp_function *f; int error=ENOENT; if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); if (blk == tcp_func_set_ptr) { /* You can't free the current default */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (blk->tfb_refcnt) { /* Still tcb attached, mark it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; rw_wunlock(&tcp_function_lock); return (EBUSY); } while (find_tcp_fb_locked(blk, &f) != NULL) { /* Found */ TAILQ_REMOVE(&t_functions, f, tf_next); f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); error = 0; } rw_wunlock(&tcp_function_lock); return (error); } void tcp_init(void) { const char *tcbhash_tuneable; int hashsize; tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; #ifdef TCP_HHOOK if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); #endif hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); if (hashsize == 0) { /* * Auto tune the hash size based on maxsockets. * A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose && IS_DEFAULT_VNET(curvnet)) printf("%s: %s auto tuned to %d\n", __func__, tcbhash_tuneable, hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. */ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, "tcp_inpcb", tcp_inpcb_init, IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. */ V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcpcb_zone, maxsockets); uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); tcp_tw_init(); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); #ifdef TCP_RFC7413 tcp_fastopen_init(); #endif /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; tcp_reass_global_init(); /* XXX virtualize those bellow? */ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; /* Setup the tcp function block list */ init_tcp_functions(); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); if (tcp_soreceive_stream) { #ifdef INET tcp_usrreqs.pru_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_usrreqs.pru_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); #ifdef TCPPCAP tcp_pcap_init(); #endif } #ifdef VIMAGE static void tcp_destroy(void *unused __unused) { int n; #ifdef TCP_HHOOK int error; #endif /* * All our processes are gone, all our sockets should be cleaned * up, which means, we should be past the tcp_discardcb() calls. * Sleep to let all tcpcb timers really disappear and cleanup. */ for (;;) { INP_LIST_RLOCK(&V_tcbinfo); n = V_tcbinfo.ipi_count; INP_LIST_RUNLOCK(&V_tcbinfo); if (n == 0) break; pause("tcpdes", hz / 10); } tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); in_pcbinfo_destroy(&V_tcbinfo); /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); #ifdef TCP_RFC7413 /* * Cannot free the zone until all tcpcbs are released as we attach * the allocations to them. */ tcp_fastopen_destroy(); #endif #ifdef TCP_HHOOK error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } #endif } VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. */ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { struct tcpopt to; struct inpcb *inp; struct ip *ip; struct mbuf *optm; struct tcphdr *nth; u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int optlen, tlen, win; bool incl_opts; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_WLOCK_ASSERT(inp); } else inp = NULL; incl_opts = false; win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > TCP_MAXWIN << tp->rcv_scale) win = TCP_MAXWIN << tp->rcv_scale; } if ((tp->t_flags & TF_NOOPT) == 0) incl_opts = true; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else if (!M_WRITABLE(m)) { struct mbuf *n; /* Can't reuse 'm', allocate a new mbuf. */ n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { m_freem(m); m_freem(n); return; } n->m_data += max_linkhdr; /* m_len is set later */ #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(n, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(n, struct ip6_hdr *); xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); ip = mtod(n, struct ip *); xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); xchg(nth->th_dport, nth->th_sport, uint16_t); th = nth; m_freem(m); m = n; } else { /* * reuse the mbuf. * XXX MRT We inherit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } tlen = 0; #ifdef INET6 if (isipv6) tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tlen = sizeof (struct tcpiphdr); #endif #ifdef INVARIANTS m->m_len = 0; KASSERT(M_TRAILINGSPACE(m) >= tlen, ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", m, tlen, (long)M_TRAILINGSPACE(m))); #endif m->m_len = tlen; to.to_flags = 0; if (incl_opts) { /* Make sure we have room. */ if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { m->m_next = m_get(M_NOWAIT, MT_DATA); if (m->m_next) { optp = mtod(m->m_next, u_char *); optm = m->m_next; } else incl_opts = false; } else { optp = (u_char *) (nth + 1); optm = m; } } if (incl_opts) { /* Timestamps. */ if (tp->t_flags & TF_RCVD_TSTMP) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* Add the options. */ tlen += optlen = tcp_addoptions(&to, optp); /* Update m_len in the correct mbuf. */ optm->m_len += optlen; } else optlen = 0; #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_WLOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) { m_freem(m); return; } } #endif m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : NULL, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } #endif /* INET */ #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif TCP_PROBE3(debug__output, tp, th, m); if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth); TCP_PROBE5(send, NULL, tp, m, tp, nth); #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, 0, NULL, NULL, inp); #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET (void) ip_output(m, NULL, NULL, 0, NULL, inp); #endif } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; /* Initialise cc_var struct for this tcpcb. */ tp->ccv = &tm->ccv; tp->ccv->type = IPPROTO_TCP; tp->ccv->ccvc.tcp = tp; rw_rlock(&tcp_function_lock); tp->t_fb = tcp_func_set_ptr; refcount_acquire(&tp->t_fb->tfb_refcnt); rw_runlock(&tcp_function_lock); /* * Use the current system default CC algorithm. */ CC_LIST_RLOCK(); KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); CC_ALGO(tp) = CC_DEFAULT(); CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } #ifdef TCP_HHOOK tp->osd = &tm->osd; if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } #endif #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, 1); callout_init(&tp->t_timers->tt_persist, 1); callout_init(&tp->t_timers->tt_keep, 1); callout_init(&tp->t_timers->tt_2msl, 1); callout_init(&tp->t_timers->tt_delack, 1); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); /* * The tcpcb will hold a reference on its inpcb until tcp_discardcb() * is called. */ in_pcbref(inp); /* Reference for tcpcb */ tp->t_inpcb = inp; /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; #ifdef TCPPCAP /* * Init the TCP PCAP queues. */ tcp_pcap_tcpcb_init(tp); #endif if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } return (tp); /* XXX */ } /* * Switch the congestion control algorithm back to NewReno for any active * control blocks using an algorithm which is about to go away. * This ensures the CC framework can allow the unload to proceed without leaving * any dangling pointers which would trigger a panic. * Returning non-zero would inform the CC framework that something went wrong * and it would be unsafe to allow the unload to proceed. However, there is no * way for this to occur with this implementation so we always return zero. */ int tcp_ccalgounload(struct cc_algo *unload_algo) { struct cc_algo *tmpalgo; struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); /* * Check all active control blocks across all network stacks and change * any that are using "unload_algo" back to NewReno. If "unload_algo" * requires cleanup code to be run, call it. */ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INP_INFO_WLOCK(&V_tcbinfo); /* * New connections already part way through being initialised * with the CC algo we're removing will not race with this code * because the INP_INFO_WLOCK is held during initialisation. We * therefore don't enter the loop below until the connection * list has stabilised. */ LIST_FOREACH(inp, &V_tcb, inp_list) { INP_WLOCK(inp); /* Important to skip tcptw structs. */ if (!(inp->inp_flags & INP_TIMEWAIT) && (tp = intotcpcb(inp)) != NULL) { /* * By holding INP_WLOCK here, we are assured * that the connection is not currently * executing inside the CC module's functions * i.e. it is safe to make the switch back to * NewReno. */ if (CC_ALGO(tp) == unload_algo) { tmpalgo = CC_ALGO(tp); /* NewReno does not require any init. */ CC_ALGO(tp) = &newreno_cc_algo; if (tmpalgo->cb_destroy != NULL) tmpalgo->cb_destroy(tp->ccv); } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); (void) tp->t_fb->tfb_tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ int released; INP_WLOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we delete the * PCB. * * If stopping a timer fails, we schedule a discard function in same * callout, and the last discard function called will take care of * deleting the tcpcb. */ tp->t_timers->tt_draincnt = 0; tcp_timer_stop(tp, TT_REXMT); tcp_timer_stop(tp, TT_PERSIST); tcp_timer_stop(tp, TT_KEEP); tcp_timer_stop(tp, TT_2MSL); tcp_timer_stop(tp, TT_DELACK); if (tp->t_fb->tfb_tcp_timer_stop_all) { /* * Call the stop-all function of the methods, * this function should call the tcp_timer_stop() * method with each of the function specific timeouts. * That stop will be called via the tfb_tcp_timer_stop() * which should use the async drain function of the * callout system (see tcp_var.h). */ tp->t_fb->tfb_tcp_timer_stop_all(tp); } /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; uint32_t ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occurred on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif tcp_free_sackholes(tp); #ifdef TCPPCAP /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tp->t_inpkts)); tcp_pcap_drain(&(tp->t_outpkts)); #endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); #ifdef TCP_HHOOK khelp_destroy_osd(tp->osd); #endif CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; if (tp->t_timers->tt_draincnt == 0) { /* We own the last reference on tcpcb, let's free it. */ TCPSTATES_DEC(tp->t_state); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); released = in_pcbrele_wlocked(inp); KASSERT(!released, ("%s: inp %p should not have been released " "here", __func__, inp)); } } void tcp_timer_discard(void *ptp) { struct inpcb *inp; struct tcpcb *tp; tp = (struct tcpcb *)ptp; CURVNET_SET(tp->t_vnet); INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0, ("%s: tcpcb has to be stopped here", __func__)); tp->t_timers->tt_draincnt--; if (tp->t_timers->tt_draincnt == 0) { /* We own the last reference on this tcpcb, let's free it. */ TCPSTATES_DEC(tp->t_state); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); if (in_pcbrele_wlocked(inp)) { INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } } INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif #ifdef TCP_RFC7413 /* * This releases the TFO pending counter resource for TFO listen * sockets as well as passively-created TFO sockets that transition * from SYN_RECEIVED to CLOSED. */ if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } #endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); if (tp->t_state != TCPS_CLOSED) tcp_state_change(tp, TCPS_CLOSED); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_flags & INP_TIMEWAIT) continue; INP_WLOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); #ifdef TCPPCAP if (tcp_pcap_aggressive_free) { /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tcpb->t_inpkts)); tcp_pcap_drain(&(tcpb->t_outpkts)); } #endif } INP_WUNLOCK(inpb); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { if (inp->inp_route.ro_rt) { RTFREE(inp->inp_route.ro_rt); inp->inp_route.ro_rt = (struct rtentry *)NULL; } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ INP_LIST_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; INP_LIST_RUNLOCK(&V_tcbinfo); m = counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n + m; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req, m, &pcb_count); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); INP_INFO_WLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_flags & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. */ } else error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) { in_pcbref(inp); inp_list[i++] = inp; } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; tcp_inptoxtp(inp, &xt); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_RUNLOCK(inp); } INP_INFO_RLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_LIST_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip == NULL) { in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); return; } icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&V_tcbinfo); inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL && PRC_IS_REDIRECT(cmd)) { /* signal EHOSTDOWN, as it flushes the cached route */ inp = (*notify)(inp, EHOSTDOWN); goto out; } icmp_tcp_seq = th->th_seq; if (inp != NULL) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { tp = intotcpcb(inp); if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. */ if (!mtu) mtu = ip_next_mtu( ntohs(ip->ip_len), 1); if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) mtu = V_tcp_minmss + sizeof(struct tcpiphdr); /* * Only process the offered MTU if it * is smaller than the current one. */ if (mtu < tp->t_maxseg + sizeof(struct tcpiphdr)) { bzero(&inc, sizeof(inc)); inc.inc_faddr = faddr; inc.inc_fibnum = inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); } } else inp = (*notify)(inp, inetctlerrmap[cmd]); } } } else { bzero(&inc, sizeof(inc)); inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, icmp_tcp_seq); } out: if (inp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); } #endif /* INET */ #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct in6_addr *dst; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct inpcb *inp; struct tcpcb *tp; struct icmp6_hdr *icmp6; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; struct in_conninfo inc; struct tcp_ports { uint16_t th_sport; uint16_t th_dport; } t_ports; tcp_seq icmp_tcp_seq; unsigned int mtu; unsigned int off; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; icmp6 = ip6cp->ip6c_icmp6; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; dst = ip6cp->ip6c_finaldst; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; dst = NULL; } if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || cmd == PRC_TIMXCEED_INTRANS) && ip6 != NULL) notify = tcp_drop_syn_sent; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip6 = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0) return; if (ip6 == NULL) { in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); return; } /* Check if we can safely get the ports from the tcp hdr */ if (m == NULL || (m->m_pkthdr.len < (int32_t) (off + sizeof(struct tcp_ports)))) { return; } bzero(&t_ports, sizeof(struct tcp_ports)); m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); INP_INFO_RLOCK(&V_tcbinfo); inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL && PRC_IS_REDIRECT(cmd)) { /* signal EHOSTDOWN, as it flushes the cached route */ inp = (*notify)(inp, EHOSTDOWN); goto out; } off += sizeof(struct tcp_ports); if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) { goto out; } m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq); if (inp != NULL) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { tp = intotcpcb(inp); if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = ntohl(icmp6->icmp6_mtu); /* * If no alternative MTU was * proposed, or the proposed * MTU was too small, set to * the min. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) goto out; /* * Only process the offered MTU if it * is smaller than the current one. */ if (mtu < tp->t_maxseg + sizeof (struct tcphdr) + sizeof (struct ip6_hdr)) { tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } else inp = (*notify)(inp, inet6ctlerrmap[cmd]); } } } else { bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc_fport = t_ports.th_dport; inc.inc_lport = t_ports.th_sport; inc.inc6_faddr = *dst; inc.inc6_laddr = ip6->ip6_src; syncache_unreach(&inc, icmp_tcp_seq); } out: if (inp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) static VNET_DEFINE(u_char, isn_secret[32]); static VNET_DEFINE(int, isn_last); static VNET_DEFINE(int, isn_last_reseed); static VNET_DEFINE(u_int32_t, isn_offset); static VNET_DEFINE(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct tcpcb *tp) { MD5_CTX isn_ctx; u_int32_t md5_buffer[4]; tcp_seq new_isn; u_int32_t projected_offset; INP_WLOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&V_isn_secret, sizeof(V_isn_secret)); V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { tcp_mtudisc(inp, -1); return (inp); } static void tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return; tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); tp->t_fb->tfb_tcp_output(tp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ uint32_t tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop4_extended nh4; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { if (fib4_lookup_nh_ext(inc->inc_fibnum, inc->inc_faddr, NHR_REF, 0, &nh4) != 0) return (0); ifp = nh4.nh_ifp; maxmtu = nh4.nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } fib4_free_nh_ext(inc->inc_fibnum, &nh4); } return (maxmtu); } #endif /* INET */ #ifdef INET6 uint32_t tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop6_extended nh6; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0, 0, &nh6) != 0) return (0); ifp = nh6.nh_ifp; maxmtu = nh6.nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } fib6_free_nh_ext(inc->inc_fibnum, &nh6); } return (maxmtu); } #endif /* INET6 */ /* * Calculate effective SMSS per RFC5681 definition for a given TCP * connection at its current state, taking into account SACK and etc. */ u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. */ #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PAD(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PAD(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PAD(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PAD(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } INP_INFO_RLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (inp->inp_flags & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); else INP_WUNLOCK(inp); } else if (!(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } /* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif TCPSTATES_DEC(tp->t_state); TCPSTATES_INC(newstate); tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } /* * Create an external-format (``xtcpcb'') structure using the information in * the kernel-format tcpcb structure pointed to by tp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) { struct tcpcb *tp = intotcpcb(inp); sbintime_t now; if (inp->inp_flags & INP_TIMEWAIT) { bzero(xt, sizeof(struct xtcpcb)); xt->t_state = TCPS_TIME_WAIT; } else { xt->t_state = tp->t_state; xt->t_flags = tp->t_flags; xt->t_sndzerowin = tp->t_sndzerowin; xt->t_sndrexmitpack = tp->t_sndrexmitpack; xt->t_rcvoopack = tp->t_rcvoopack; now = getsbinuptime(); #define COPYTIMER(ttt) do { \ if (callout_active(&tp->t_timers->ttt)) \ xt->ttt = (tp->t_timers->ttt.c_time - now) / \ SBT_1MS; \ else \ xt->ttt = 0; \ } while (0) COPYTIMER(tt_delack); COPYTIMER(tt_rexmt); COPYTIMER(tt_persist); COPYTIMER(tt_keep); COPYTIMER(tt_2msl); #undef COPYTIMER xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, TCP_FUNCTION_NAME_LEN_MAX); } xt->xt_len = sizeof(struct xtcpcb); in_pcbtoxinpcb(inp, &xt->xt_inp); if (inp->inp_socket == NULL) xt->xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; } Index: head/sys/netinet/tcp_syncache.c =================================================================== --- head/sys/netinet/tcp_syncache.c (revision 319721) +++ head/sys/netinet/tcp_syncache.c (revision 319722) @@ -1,2256 +1,2257 @@ /*- * Copyright (c) 2001 McAfee, Inc. * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. [2001 McAfee, Inc.] * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pcbgroup.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #ifdef TCP_RFC7413 #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include static VNET_DEFINE(int, tcp_syncookies) = 1; #define V_tcp_syncookies VNET(tcp_syncookies) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookies), 0, "Use TCP SYN cookies if the syncache overflows"); static VNET_DEFINE(int, tcp_syncookiesonly) = 0; #define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); static VNET_DEFINE(int, functions_inherit_listen_socket_stack) = 1; #define V_functions_inherit_listen_socket_stack \ VNET(functions_inherit_listen_socket_stack) SYSCTL_INT(_net_inet_tcp, OID_AUTO, functions_inherit_listen_socket_stack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(functions_inherit_listen_socket_stack), 0, "Inherit listen socket's stack"); #ifdef TCP_OFFLOAD #define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); static int syncache_respond(struct syncache *, struct syncache_head *, int, const struct mbuf *); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, uint8_t *, uintptr_t); static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcphdr *, struct tcpopt *, struct socket *); static void syncookie_reseed(void *); #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso); #endif /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 static VNET_DEFINE(struct tcp_syncache, tcp_syncache); #define V_tcp_syncache VNET(tcp_syncache) static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.bucket_limit), 0, "Per-bucket hash limit for syncache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.cache_limit), 0, "Overall entry limit for syncache"); SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET, &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.hashsize), 0, "Size of TCP syncache hashtable"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncache.rexmt_limit), 0, "Limit on SYN/ACK retransmissions"); VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) #define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) #define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) /* * Requires the syncache entry to be already removed from the bucket list. */ static void syncache_free(struct syncache *sc) { if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); if (sc->sc_cred) crfree(sc->sc_cred); #ifdef MAC mac_syncache_destroy(&sc->sc_label); #endif uma_zfree(V_tcp_syncache.zone, sc); } void syncache_init(void) { int i; V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; V_tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &V_tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &V_tcp_syncache.bucket_limit); if (!powerof2(V_tcp_syncache.hashsize) || V_tcp_syncache.hashsize == 0) { printf("WARNING: syncache hash size is not a power of 2.\n"); V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; } V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; /* Set limits. */ V_tcp_syncache.cache_limit = V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &V_tcp_syncache.cache_limit); /* Allocate the hash table. */ V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); #ifdef VIMAGE V_tcp_syncache.vnet = curvnet; #endif /* Initialize the hash buckets. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, &V_tcp_syncache.hashbase[i].sch_mtx, 0); V_tcp_syncache.hashbase[i].sch_length = 0; V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache; V_tcp_syncache.hashbase[i].sch_last_overflow = -(SYNCOOKIE_LIFETIME + 1); } /* Create the syncache entry zone. */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); /* Start the SYN cookie reseeder callout. */ callout_init(&V_tcp_syncache.secret.reseed, 1); arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0); arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, syncookie_reseed, &V_tcp_syncache); } #ifdef VIMAGE void syncache_destroy(void) { struct syncache_head *sch; struct syncache *sc, *nsc; int i; /* * Stop the re-seed timer before freeing resources. No need to * possibly schedule it another time. */ callout_drain(&V_tcp_syncache.secret.reseed); /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; callout_drain(&sch->sch_timer); SCH_LOCK(sch); TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) syncache_drop(sc, sch); SCH_UNLOCK(sch); KASSERT(TAILQ_EMPTY(&sch->sch_bucket), ("%s: sch->sch_bucket not empty", __func__)); KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0", __func__, sch->sch_length)); mtx_destroy(&sch->sch_mtx); } KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0, ("%s: cache_count not 0", __func__)); /* Free the allocated global resources. */ uma_zdestroy(V_tcp_syncache.zone); free(V_tcp_syncache.hashbase, M_SYNCACHE); } #endif /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. */ static void syncache_insert(struct syncache *sc, struct syncache_head *sch) { struct syncache *sc2; SCH_LOCK(sch); /* * Make sure that we don't overflow the per-bucket limit. * If the bucket is full, toss the oldest element. */ if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); sch->sch_last_overflow = time_uptime; syncache_drop(sc2, sch); TCPSTAT_INC(tcps_sc_bucketoverflow); } /* Put it into the bucket. */ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_added(tod, sc->sc_todctx); } #endif /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; syncache_timeout(sc, sch, 1); SCH_UNLOCK(sch); TCPSTATES_INC(TCPS_SYN_RECEIVED); TCPSTAT_INC(tcps_sc_added); } /* * Remove and free entry from syncache bucket row. * Expects locked syncache head. */ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { SCH_LOCK_ASSERT(sch); TCPSTATES_DEC(TCPS_SYN_RECEIVED); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif syncache_free(sc); } /* * Engage/reengage time on bucket row. */ static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { sc->sc_rxttime = ticks + TCPTV_RTOBASE * (tcp_syn_backoff[sc->sc_rxmits]); sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; if (docallout) callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, syncache_timer, (void *)sch); } } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. * One separate timer for each bucket row. */ static void syncache_timer(void *xsch) { struct syncache_head *sch = (struct syncache_head *)xsch; struct syncache *sc, *nsc; int tick = ticks; char *s; CURVNET_SET(sch->sch_sc->vnet); /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); /* * In the following cycle we may remove some entries and/or * advance some timeouts, so re-initialize the bucket timer. */ sch->sch_nextc = tick + INT_MAX; TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be * gone by the time we resend the SYN/ACK. We do * not expect this to happens often. If it does, * then the RST will be sent by the time the remote * host does the SYN/ACK->ACK. */ if (TSTMP_GT(sc->sc_rxttime, tick)) { if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) sch->sch_nextc = sc->sc_rxttime; continue; } if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " "giving up and removing syncache entry\n", s, __func__); free(s, M_TCPLOG); } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_stale); continue; } if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Response timeout, " "retransmitting (%u) SYN|ACK\n", s, __func__, sc->sc_rxmits); free(s, M_TCPLOG); } syncache_respond(sc, sch, 1, NULL); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); CURVNET_RESTORE(); } /* * Find an entry in the syncache. * Returns always with locked syncache_head plus a matching entry or NULL. */ static struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { struct syncache *sc; struct syncache_head *sch; uint32_t hash; /* * The hash is built on foreign port + local port + foreign address. * We rely on the fact that struct in_conninfo starts with 16 bits * of foreign port, then 16 bits of local port then followed by 128 * bits of foreign address. In case of IPv4 address, the first 3 * 32-bit words of the address always are zeroes. */ hash = jenkins_hash32((uint32_t *)&inc->inc_ie, 5, V_tcp_syncache.hash_secret) & V_tcp_syncache.hashmask; sch = &V_tcp_syncache.hashbase[hash]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) if (bcmp(&inc->inc_ie, &sc->sc_inc.inc_ie, sizeof(struct in_endpoints)) == 0) break; return (sc); /* Always returns with locked sch. */ } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. */ void syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) { struct syncache *sc; struct syncache_head *sch; char *s = NULL; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); /* * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. * See RFC 793 page 65, section SEGMENT ARRIVES. */ if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " "FIN flag set, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * No corresponding connection was found in syncache. * If syncookies are enabled and possibly exclusively * used, or we are under memory pressure, a valid RST * may not find a syncache entry. In that case we're * done and no SYN|ACK retransmissions will happen. * Otherwise the RST was misdirected or spoofed. */ if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST without matching " "syncache entry (possibly syncookie only), " "segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. */ if (SEQ_GEQ(th->th_seq, sc->sc_irs) && SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { syncache_drop(sc, sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " "connection attempt aborted by remote endpoint\n", s, __func__); TCPSTAT_INC(tcps_sc_reset); } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " "IRS %u (+WND %u), segment ignored\n", s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); TCPSTAT_INC(tcps_badrst); } done: if (s != NULL) free(s, M_TCPLOG); SCH_UNLOCK(sch); } void syncache_badack(struct in_conninfo *inc) { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_badack); } SCH_UNLOCK(sch); } void syncache_unreach(struct in_conninfo *inc, tcp_seq th_seq) { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) goto done; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th_seq) != sc->sc_iss) goto done; /* * If we've rertransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { sc->sc_flags |= SCF_UNREACH; goto done; } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_unreach); done: SCH_UNLOCK(sch); } /* * Build a new TCP socket structure from a syncache entry. * * On success return the newly created socket with its underlying inp locked. */ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { struct tcp_function_block *blk; struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; int error; char *s; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ so = sonewconn(lso, 0); if (so == NULL) { /* * Drop the connection; we will either send a RST or * have the peer retransmit its SYN again after its * RTO and try again. */ TCPSTAT_INC(tcps_listendrop); if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Socket create failed " "due to limits or memory shortage\n", s, __func__); free(s, M_TCPLOG); } goto abort2; } #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WLOCK(inp); /* * Exclusive pcbinfo lock is not required in syncache socket case even * if two inpcb locks can be acquired simultaneously: * - the inpcb in LISTEN state, * - the newly created inp. * * In this case, an inp cannot be at same time in LISTEN state and * just created by an accept() call. */ INP_HASH_WLOCK(&V_tcbinfo); /* Insert new socket into PCB hash list. */ inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif /* * If there's an mbuf and it has a flowid, then let's initialise the * inp with that particular flowid. */ if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } /* * Install in the reservation hash table for now, but don't yet * install a connection group since the full 4-tuple isn't yet * configured. */ inp->inp_lport = sc->sc_inc.inc_lport; if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. */ #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) inp->in6p_laddr = in6addr_any; else #endif inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in_pcbinshash failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { struct inpcb *oinp = sotoinpcb(lso); struct in6_addr laddr6; struct sockaddr_in6 sin6; /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different than the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call. */ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = sc->sc_inc.inc6_faddr; sin6.sin6_port = sc->sc_inc.inc_fport; sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, thread0.td_ucred, m)) != 0) { inp->in6p_laddr = laddr6; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } /* Override flowlabel from in6_pcbconnect. */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; inp->inp_flow |= sc->sc_flowlabel; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { struct in_addr laddr; struct sockaddr_in sin; inp->inp_options = (m) ? ip_srcroute(m) : NULL; if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr = sc->sc_inc.inc_faddr; sin.sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; if ((error = in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin, thread0.td_ucred, m)) != 0) { inp->inp_laddr = laddr; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in_pcbconnect failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } } #endif /* INET */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* Copy old policy into new socket's. */ if (ipsec_copy_pcbpolicy(sotoinpcb(lso), inp) != 0) printf("syncache_socket: could not copy policy\n"); #endif INP_HASH_WUNLOCK(&V_tcbinfo); tp = intotcpcb(inp); tcp_state_change(tp, TCPS_SYN_RECEIVED); tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); blk = sototcpcb(lso)->t_fb; if (V_functions_inherit_listen_socket_stack && blk != tp->t_fb) { /* * Our parents t_fb was not the default, * we need to release our ref on tp->t_fb and * pickup one on the new entry. */ struct tcp_function_block *rblk; rblk = find_and_ref_tcp_fb(blk); KASSERT(rblk != NULL, ("cannot find blk %p out of syncache?", blk)); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = rblk; if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } } tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; else { if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->snd_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_requested_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsreflect; tp->ts_recent_age = tcp_ts_getticks(); tp->ts_offset = sc->sc_tsoff; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; } if (sc->sc_flags & SCF_ECN) tp->t_flags |= TF_ECN_PERMIT; /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. */ tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, indicate that CWND to be * limited to one segment in cc_conn_init(). * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits. */ if (sc->sc_rxmits > 1) tp->snd_cwnd = 1; #ifdef TCP_OFFLOAD /* * Allow a TOE driver to install its hooks. Note that we hold the * pcbinfo lock too and that prevents tcp_usr_accept from accepting a * new connection before the TOE driver has done its thing. */ if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_offload_socket(tod, sc->sc_todctx, so); } #endif /* * Copy and activate timers. */ tp->t_keepinit = sototcpcb(lso)->t_keepinit; tp->t_keepidle = sototcpcb(lso)->t_keepidle; tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); TCPSTAT_INC(tcps_accepts); return (so); abort: INP_WUNLOCK(inp); abort2: if (so != NULL) soabort(so); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. * * On syncache_socket() success the newly created socket * has its underlying inp locked. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { struct syncache *sc; struct syncache_head *sch; struct syncache scs; char *s; /* * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); #ifdef INVARIANTS /* * Test code for syncookies comparing the syncache stored * values with the reconstructed values from the cookie. */ if (sc != NULL) syncookie_cmp(inc, sch, sc, th, to, *lsop); #endif if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. Check if syncookies are used in case of syncache * overflows * B. See if this socket has had a syncache entry dropped in * the recent past. We don't want to accept a bogus * syncookie if we've never received a SYN or accept it * twice. * C. check that the syncookie is valid. If it is, then * cobble up a fake syncache entry, and return. */ if (!V_tcp_syncookies) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", s, __func__); goto failed; } if (!V_tcp_syncookiesonly && sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (no syncache entry)\n", s, __func__); goto failed; } bzero(&scs, sizeof(scs)); sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); goto failed; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* If received ACK has MD5 signature, check it. */ if ((to->to_flags & TOF_SIGNATURE) != 0 && (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to->to_signature) != 0)) { /* Drop the ACK. */ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment rejected, " "MD5 signature doesn't match.\n", s, __func__); free(s, M_TCPLOG); } TCPSTAT_INC(tcps_sig_err_sigopt); return (-1); /* Do not send RST */ } #endif /* TCP_SIGNATURE */ } else { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* * If listening socket requested TCP digests, check that * received ACK has signature and it is correct. * If not, drop the ACK and leave sc entry in th cache, * because SYN was received with correct signature. */ if (sc->sc_flags & SCF_SIGNATURE) { if ((to->to_flags & TOF_SIGNATURE) == 0) { /* No signature */ TCPSTAT_INC(tcps_sig_err_nosigopt); SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment " "rejected, MD5 signature wasn't " "provided.\n", s, __func__); free(s, M_TCPLOG); } return (-1); /* Do not send RST */ } if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to->to_signature) != 0) { /* Doesn't match or no SA */ SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment " "rejected, MD5 signature doesn't " "match.\n", s, __func__); free(s, M_TCPLOG); } return (-1); /* Do not send RST */ } } #endif /* TCP_SIGNATURE */ /* * Pull out the entry to unlock the bucket row. * * NOTE: We must decrease TCPS_SYN_RECEIVED count here, not * tcp_state_change(). The tcpcb is not existent at this * moment. A new one will be allocated via syncache_socket-> * sonewconn->tcp_usr_attach in TCPS_CLOSED state, then * syncache_socket() will change it to TCPS_SYN_RECEIVED. */ TCPSTATES_DEC(TCPS_SYN_RECEIVED); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif SCH_UNLOCK(sch); } /* * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). */ if (th->th_ack != sc->sc_iss + 1) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); goto failed; } /* * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ if (SEQ_LEQ(th->th_seq, sc->sc_irs) || SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); goto failed; } /* * If timestamps were not negotiated during SYN/ACK they * must not appear on any segment during this session. */ if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment rejected\n", s, __func__); goto failed; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session. * XXXAO: This is only informal as there have been unverified * reports of non-compliants stacks. */ if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); s = NULL; } } /* * If timestamps were negotiated, the reflected timestamp * must be equal to what we actually sent in the SYN|ACK * except in the case of 0. Some boxes are known for sending * broken timestamp replies during the 3whs (and potentially * during the connection also). * * Accept the final ACK of 3whs with reflected timestamp of 0 * instead of sending a RST and deleting the syncache entry. */ if ((to->to_flags & TOF_TS) && to->to_tsecr && to->to_tsecr != sc->sc_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " "segment rejected\n", s, __func__, to->to_tsecr, sc->sc_ts); goto failed; } *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) TCPSTAT_INC(tcps_sc_aborted); else TCPSTAT_INC(tcps_sc_completed); /* how do we find the inp for the new socket? */ if (sc != &scs) syncache_free(sc); return (1); failed: if (sc != NULL && sc != &scs) syncache_free(sc); if (s != NULL) free(s, M_TCPLOG); *lsop = NULL; return (0); } #ifdef TCP_RFC7413 static void syncache_tfo_expand(struct syncache *sc, struct socket **lsop, struct mbuf *m, uint64_t response_cookie) { struct inpcb *inp; struct tcpcb *tp; unsigned int *pending_counter; /* * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); pending_counter = intotcpcb(sotoinpcb(*lsop))->t_tfo_pending; *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) { TCPSTAT_INC(tcps_sc_aborted); atomic_subtract_int(pending_counter, 1); } else { inp = sotoinpcb(*lsop); tp = intotcpcb(inp); tp->t_flags |= TF_FASTOPEN; tp->t_tfo_cookie = response_cookie; tp->snd_max = tp->iss; tp->snd_nxt = tp->iss; tp->t_tfo_pending = pending_counter; TCPSTAT_INC(tcps_sc_completed); } } #endif /* TCP_RFC7413 */ /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. * * The exception to the above is when a SYN with a valid TCP Fast Open (TFO) * cookie is processed and a new socket is created. In this case, any data * accompanying the SYN will be queued to the socket by tcp_input() and will * be ACKed either when the application sends response data or the delayed * ACK timer expires, whichever comes first. */ int syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, void *todctx) { struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; u_int ltflags; int win, ip_ttl, ip_tos; char *s; int rv = 0; #ifdef INET6 int autoflowlabel = 0; #endif #ifdef MAC struct label *maclabel; #endif struct syncache scs; struct ucred *cred; #ifdef TCP_RFC7413 uint64_t tfo_response_cookie; unsigned int *tfo_pending = NULL; int tfo_cookie_valid = 0; int tfo_response_cookie_valid = 0; #endif INP_WLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); /* * Combine all so/tp operations very early to drop the INP lock as * soon as possible. */ so = *lsop; + KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so)); tp = sototcpcb(so); cred = crhold(so->so_cred); #ifdef INET6 if ((inc->inc_flags & INC_ISIPV6) && (inp->inp_flags & IN6P_AUTOFLOWLABEL)) autoflowlabel = 1; #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; - win = sbspace(&so->so_rcv); + win = so->sol_sbrcv_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); #ifdef TCP_RFC7413 if (V_tcp_fastopen_enabled && IS_FASTOPEN(tp->t_flags) && (tp->t_tfo_pending != NULL) && (to->to_flags & TOF_FASTOPEN)) { /* * Limit the number of pending TFO connections to * approximately half of the queue limit. This prevents TFO * SYN floods from starving the service by filling the * listen queue with bogus TFO connections. */ if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <= - (so->so_qlimit / 2)) { + (so->sol_qlimit / 2)) { int result; result = tcp_fastopen_check_cookie(inc, to->to_tfo_cookie, to->to_tfo_len, &tfo_response_cookie); tfo_cookie_valid = (result > 0); tfo_response_cookie_valid = (result >= 0); } /* * Remember the TFO pending counter as it will have to be * decremented below if we don't make it to syncache_tfo_expand(). */ tfo_pending = tp->t_tfo_pending; } #endif /* By the time we drop the lock these should no longer be used. */ so = NULL; tp = NULL; #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { INP_WUNLOCK(inp); goto done; } else mac_syncache_create(maclabel, inp); #endif #ifdef TCP_RFC7413 if (!tfo_cookie_valid) #endif INP_WUNLOCK(inp); /* * Remember the IP options, if any. */ #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif #ifdef INET ipopts = (m) ? ip_srcroute(m) : NULL; #else ipopts = NULL; #endif #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* * If listening socket requested TCP digests, check that received * SYN has signature and it is correct. If signature doesn't match * or TCP_SIGNATURE support isn't enabled, drop the packet. */ if (ltflags & TF_SIGNATURE) { if ((to->to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); goto done; } if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to->to_signature) != 0) goto done; } #endif /* TCP_SIGNATURE */ /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX: should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) * * XXX: We do not check the sequence number to see if this is a * real retransmit or a new connection attempt. The question is * how to handle such a case; either ignore it as spoofed, or * drop the current entry and create a new one? */ sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { #ifdef TCP_RFC7413 if (tfo_cookie_valid) INP_WUNLOCK(inp); #endif TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. */ if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) sc->sc_tsreflect = to->to_tsval; else sc->sc_flags &= ~SCF_TIMESTAMP; #ifdef MAC /* * Since we have already unconditionally allocated label * storage, free it up. The syncache entry will already * have an initialized label we can use. */ mac_syncache_destroy(&maclabel); #endif /* Retransmit SYN|ACK and reset retransmit count. */ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " "resetting timer and retransmitting SYN|ACK\n", s, __func__); free(s, M_TCPLOG); } if (syncache_respond(sc, sch, 1, m) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } SCH_UNLOCK(sch); goto done; } #ifdef TCP_RFC7413 if (tfo_cookie_valid) { bzero(&scs, sizeof(scs)); sc = &scs; goto skip_alloc; } #endif sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ TCPSTAT_INC(tcps_sc_zonefail); if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) { sch->sch_last_overflow = time_uptime; syncache_drop(sc, sch); } sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { if (V_tcp_syncookies) { bzero(&scs, sizeof(scs)); sc = &scs; } else { SCH_UNLOCK(sch); if (ipopts) (void) m_free(ipopts); goto done; } } } #ifdef TCP_RFC7413 skip_alloc: if (!tfo_cookie_valid && tfo_response_cookie_valid) sc->sc_tfo_cookie = &tfo_response_cookie; #endif /* * Fill in the syncache values. */ #ifdef MAC sc->sc_label = maclabel; #endif sc->sc_cred = cred; cred = NULL; sc->sc_ipopts = ipopts; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif { sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; } #ifdef TCP_OFFLOAD sc->sc_tod = tod; sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; sc->sc_iss = arc4random(); sc->sc_flags = 0; sc->sc_flowlabel = 0; /* * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. * win was derived from socket earlier in the function. */ win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; if (V_tcp_do_rfc1323) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if (to->to_flags & TOF_TS) { sc->sc_tsreflect = to->to_tsval; sc->sc_ts = tcp_ts_getticks(); sc->sc_flags |= SCF_TIMESTAMP; } if (to->to_flags & TOF_SCALE) { int wscale = 0; /* * Pick the smallest possible scaling factor that * will still allow us to scale up to sb_max, aka * kern.ipc.maxsockbuf. * * We do this because there are broken firewalls that * will corrupt the window scale option, leading to * the other endpoint believing that our advertised * window is unscaled. At scale factors larger than * 5 the unscaled window will drop below 1500 bytes, * leading to serious problems when traversing these * broken firewalls. * * With the default maxsockbuf of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger maxsockbuf should watch out * for the compatibility problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a * or ) segment itself is never scaled. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = to->to_wscale; sc->sc_flags |= SCF_WINSCALE; } } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* * If listening socket requested TCP digests, flag this in the * syncache so that syncache_respond() will do the right thing * with the SYN+ACK. */ if (ltflags & TF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif /* TCP_SIGNATURE */ if (to->to_flags & TOF_SACKPERM) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_MSS) sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if (ltflags & TF_NOOPT) sc->sc_flags |= SCF_NOOPT; if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) sc->sc_flags |= SCF_ECN; if (V_tcp_syncookies) sc->sc_iss = syncookie_generate(sch, sc); #ifdef INET6 if (autoflowlabel) { if (V_tcp_syncookies) sc->sc_flowlabel = sc->sc_iss; else sc->sc_flowlabel = ip6_randomflowlabel(); sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; } #endif SCH_UNLOCK(sch); #ifdef TCP_RFC7413 if (tfo_cookie_valid) { syncache_tfo_expand(sc, lsop, m, tfo_response_cookie); /* INP_WUNLOCK(inp) will be performed by the caller */ rv = 1; goto tfo_expanded; } #endif /* * Do a standard 3-way handshake. */ if (syncache_respond(sc, sch, 0, m) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } else { if (sc != &scs) syncache_free(sc); TCPSTAT_INC(tcps_sc_dropped); } done: if (m) { *lsop = NULL; m_freem(m); } #ifdef TCP_RFC7413 /* * If tfo_pending is not NULL here, then a TFO SYN that did not * result in a new socket was processed and the associated pending * counter has not yet been decremented. All such TFO processing paths * transit this point. */ if (tfo_pending != NULL) tcp_fastopen_decrement_counter(tfo_pending); tfo_expanded: #endif if (cred != NULL) crfree(cred); #ifdef MAC if (sc == &scs) mac_syncache_destroy(&maclabel); #endif return (rv); } /* * Send SYN|ACK to the peer. Either in response to the peer's SYN, * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL. */ static int syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked, const struct mbuf *m0) { struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th = NULL; int optlen, error = 0; /* Make compiler happy */ u_int16_t hlen, tlen, mssopt; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif hlen = #ifdef INET6 (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) : #endif sizeof(struct ip); tlen = hlen + sizeof(struct tcphdr); /* Determine MSS we advertize to other end of connection. */ mssopt = tcp_mssopt(&sc->sc_inc); if (sc->sc_peer_mss) mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, ("syncache: mbuf too small")); /* Create the IP+TCP header from scratch. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); #ifdef MAC mac_syncache_create_mbuf(sc->sc_label, m); #endif m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; ip6->ip6_flow |= sc->sc_flowlabel; th = (struct tcphdr *)(ip6 + 1); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = htons(tlen); ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_ip_ttl; ip->ip_tos = sc->sc_ip_tos; /* * See if we should do MTU discovery. Route lookups are * expensive, so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= htons(IP_DF); th = (struct tcphdr *)(ip + 1); } #endif /* INET */ th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; th->th_seq = htonl(sc->sc_iss); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = sizeof(struct tcphdr) >> 2; th->th_x2 = 0; th->th_flags = TH_SYN|TH_ACK; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; if (sc->sc_flags & SCF_ECN) { th->th_flags |= TH_ECE; TCPSTAT_INC(tcps_ecn_shs); } /* Tack on the TCP options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; to.to_mss = mssopt; to.to_flags = TOF_MSS; if (sc->sc_flags & SCF_WINSCALE) { to.to_wscale = sc->sc_requested_r_scale; to.to_flags |= TOF_SCALE; } if (sc->sc_flags & SCF_TIMESTAMP) { /* Virgin timestamp or TCP cookie enhanced one. */ to.to_tsval = sc->sc_ts; to.to_tsecr = sc->sc_tsreflect; to.to_flags |= TOF_TS; } if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (sc->sc_flags & SCF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif #ifdef TCP_RFC7413 if (sc->sc_tfo_cookie) { to.to_flags |= TOF_FASTOPEN; to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = sc->sc_tfo_cookie; /* don't send cookie again when retransmitting response */ sc->sc_tfo_cookie = NULL; } #endif optlen = tcp_addoptions(&to, (u_char *)(th + 1)); /* Adjust headers by option size. */ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; m->m_len += optlen; m->m_pkthdr.len += optlen; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); else #endif ip->ip_len = htons(ntohs(ip->ip_len) + optlen); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (sc->sc_flags & SCF_SIGNATURE) { KASSERT(to.to_flags & TOF_SIGNATURE, ("tcp_addoptions() didn't set tcp_signature")); /* NOTE: to.to_signature is inside of mbuf */ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, to.to_signature) != 0) { m_freem(m); return (EACCES); } } #endif } else optlen = 0; M_SETFIB(m, sc->sc_inc.inc_fibnum); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); /* * If we have peer's SYN and it has a flowid, then let's assign it to * our SYN|ACK. ip6_output() and ip_output() will not assign flowid * to SYN|ACK due to lack of inp here. */ if (m0 != NULL && M_HASHTYPE_GET(m0) != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = m0->m_pkthdr.flowid; M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0)); } #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(NULL, NULL); #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } #endif return (error); } /* * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks * that exceed the capacity of the syncache by avoiding the storage of any * of the SYNs we receive. Syncookies defend against blind SYN flooding * attacks where the attacker does not have access to our responses. * * Syncookies encode and include all necessary information about the * connection setup within the SYN|ACK that we send back. That way we * can avoid keeping any local state until the ACK to our SYN|ACK returns * (if ever). Normally the syncache and syncookies are running in parallel * with the latter taking over when the former is exhausted. When matching * syncache entry is found the syncookie is ignored. * * The only reliable information persisting the 3WHS is our initial sequence * number ISS of 32 bits. Syncookies embed a cryptographically sufficient * strong hash (MAC) value and a few bits of TCP SYN options in the ISS * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK * returns and signifies a legitimate connection if it matches the ACK. * * The available space of 32 bits to store the hash and to encode the SYN * option information is very tight and we should have at least 24 bits for * the MAC to keep the number of guesses by blind spoofing reasonably high. * * SYN option information we have to encode to fully restore a connection: * MSS: is imporant to chose an optimal segment size to avoid IP level * fragmentation along the path. The common MSS values can be encoded * in a 3-bit table. Uncommon values are captured by the next lower value * in the table leading to a slight increase in packetization overhead. * WSCALE: is necessary to allow large windows to be used for high delay- * bandwidth product links. Not scaling the window when it was initially * negotiated is bad for performance as lack of scaling further decreases * the apparent available send window. We only need to encode the WSCALE * we received from the remote end. Our end can be recalculated at any * time. The common WSCALE values can be encoded in a 3-bit table. * Uncommon values are captured by the next lower value in the table * making us under-estimate the available window size halving our * theoretically possible maximum throughput for that connection. * SACK: Greatly assists in packet loss recovery and requires 1 bit. * TIMESTAMP and SIGNATURE is not encoded because they are permanent options * that are included in all segments on a connection. We enable them when * the ACK has them. * * Security of syncookies and attack vectors: * * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod) * together with the gloabl secret to make it unique per connection attempt. * Thus any change of any of those parameters results in a different MAC output * in an unpredictable way unless a collision is encountered. 24 bits of the * MAC are embedded into the ISS. * * To prevent replay attacks two rotating global secrets are updated with a * new random value every 15 seconds. The life-time of a syncookie is thus * 15-30 seconds. * * Vector 1: Attacking the secret. This requires finding a weakness in the * MAC itself or the way it is used here. The attacker can do a chosen plain * text attack by varying and testing the all parameters under his control. * The strength depends on the size and randomness of the secret, and the * cryptographic security of the MAC function. Due to the constant updating * of the secret the attacker has at most 29.999 seconds to find the secret * and launch spoofed connections. After that he has to start all over again. * * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC * size an average of 4,823 attempts are required for a 50% chance of success * to spoof a single syncookie (birthday collision paradox). However the * attacker is blind and doesn't know if one of his attempts succeeded unless * he has a side channel to interfere success from. A single connection setup * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets. * This many attempts are required for each one blind spoofed connection. For * every additional spoofed connection he has to launch another N attempts. * Thus for a sustained rate 100 spoofed connections per second approximately * 1,800,000 packets per second would have to be sent. * * NB: The MAC function should be fast so that it doesn't become a CPU * exhaustion attack vector itself. * * References: * RFC4987 TCP SYN Flooding Attacks and Common Mitigations * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996 * http://cr.yp.to/syncookies.html (overview) * http://cr.yp.to/syncookies/archive (details) * * * Schematic construction of a syncookie enabled Initial Sequence Number: * 0 1 2 3 * 12345678901234567890123456789012 * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| * * x 24 MAC (truncated) * W 3 Send Window Scale index * M 3 MSS index * S 1 SACK permitted * P 1 Odd/even secret */ /* * Distribution and probability of certain MSS values. Those in between are * rounded down to the next lower one. * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011] * .2% .3% 5% 7% 7% 20% 15% 45% */ static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 }; /* * Distribution and probability of certain WSCALE values. We have to map the * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 * bits based on prevalence of certain values. Where we don't have an exact * match for are rounded down to the next lower one letting us under-estimate * the true available window. At the moment this would happen only for the * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer * and window size). The absence of the WSCALE option (no scaling in either * direction) is encoded with index zero. * [WSCALE values histograms, Allman, 2012] * X 10 10 35 5 6 14 10% by host * X 11 4 5 5 18 49 3% by connections */ static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 }; /* * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed * and good cryptographic properties. */ static uint32_t syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags, uint8_t *secbits, uintptr_t secmod) { SIPHASH_CTX ctx; uint32_t siphash[2]; SipHash24_Init(&ctx); SipHash_SetKey(&ctx, secbits); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr)); break; #endif } SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport)); SipHash_Update(&ctx, &irs, sizeof(irs)); SipHash_Update(&ctx, &flags, sizeof(flags)); SipHash_Update(&ctx, &secmod, sizeof(secmod)); SipHash_Final((u_int8_t *)&siphash, &ctx); return (siphash[0] ^ siphash[1]); } static tcp_seq syncookie_generate(struct syncache_head *sch, struct syncache *sc) { u_int i, mss, secbit, wscale; uint32_t iss, hash; uint8_t *secbits; union syncookie cookie; SCH_LOCK_ASSERT(sch); cookie.cookie = 0; /* Map our computed MSS into the 3-bit index. */ mss = min(tcp_mssopt(&sc->sc_inc), max(sc->sc_peer_mss, V_tcp_minmss)); for (i = nitems(tcp_sc_msstab) - 1; tcp_sc_msstab[i] > mss && i > 0; i--) ; cookie.flags.mss_idx = i; /* * Map the send window scale into the 3-bit index but only if * the wscale option was received. */ if (sc->sc_flags & SCF_WINSCALE) { wscale = sc->sc_requested_s_scale; for (i = nitems(tcp_sc_wstab) - 1; tcp_sc_wstab[i] > wscale && i > 0; i--) ; cookie.flags.wscale_idx = i; } /* Can we do SACK? */ if (sc->sc_flags & SCF_SACK) cookie.flags.sack_ok = 1; /* Which of the two secrets to use. */ secbit = sch->sch_sc->secret.oddeven & 0x1; cookie.flags.odd_even = secbit; secbits = sch->sch_sc->secret.key[secbit]; hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, (uintptr_t)sch); /* * Put the flags into the hash and XOR them to get better ISS number * variance. This doesn't enhance the cryptographic strength and is * done to prevent the 8 cookie bits from showing up directly on the * wire. */ iss = hash & ~0xff; iss |= cookie.cookie ^ (hash >> 24); /* Randomize the timestamp. */ if (sc->sc_flags & SCF_TIMESTAMP) { sc->sc_ts = arc4random(); sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks(); } TCPSTAT_INC(tcps_sc_sendcookie); return (iss); } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso) { uint32_t hash; uint8_t *secbits; tcp_seq ack, seq; int wnd, wscale = 0; union syncookie cookie; SCH_LOCK_ASSERT(sch); /* * Pull information out of SYN-ACK/ACK and revert sequence number * advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; /* * Unpack the flags containing enough information to restore the * connection. */ cookie.cookie = (ack & 0xff) ^ (ack >> 24); /* Which of the two secrets to use. */ secbits = sch->sch_sc->secret.key[cookie.flags.odd_even]; hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); /* The recomputed hash matches the ACK if this was a genuine cookie. */ if ((ack & ~0xff) != (hash & ~0xff)) return (NULL); /* Fill in the syncache values. */ sc->sc_flags = 0; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; break; #endif #ifdef INET6 case INC_ISIPV6: if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK; break; #endif } sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx]; /* We can simply recompute receive window scale we sent earlier. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; /* Only use wscale if it was enabled in the orignal SYN. */ if (cookie.flags.wscale_idx > 0) { sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx]; sc->sc_flags |= SCF_WINSCALE; } - wnd = sbspace(&lso->so_rcv); + wnd = lso->sol_sbrcv_hiwat; wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; if (cookie.flags.sack_ok) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_TS) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_ts = to->to_tsecr; sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks(); } if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; sc->sc_rxmits = 0; TCPSTAT_INC(tcps_sc_recvcookie); return (sc); } #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso) { struct syncache scs, *scx; char *s; bzero(&scs, sizeof(scs)); scx = syncookie_lookup(inc, sch, &scs, th, to, lso); if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) return (0); if (scx != NULL) { if (sc->sc_peer_mss != scx->sc_peer_mss) log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", s, __func__, sc->sc_requested_r_scale, scx->sc_requested_r_scale); if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", s, __func__, sc->sc_requested_s_scale, scx->sc_requested_s_scale); if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); } if (s != NULL) free(s, M_TCPLOG); return (0); } #endif /* INVARIANTS */ static void syncookie_reseed(void *arg) { struct tcp_syncache *sc = arg; uint8_t *secbits; int secbit; /* * Reseeding the secret doesn't have to be protected by a lock. * It only must be ensured that the new random values are visible * to all CPUs in a SMP environment. The atomic with release * semantics ensures that. */ secbit = (sc->secret.oddeven & 0x1) ? 0 : 1; secbits = sc->secret.key[secbit]; arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); atomic_add_rel_int(&sc->secret.oddeven, 1); /* Reschedule ourself. */ callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); } /* * Exports the syncache entries to userland so that netstat can display * them alongside the other sockets. This function is intended to be * called only from tcp_pcblist. * * Due to concurrency on an active system, the number of pcbs exported * may have no relation to max_pcbs. max_pcbs merely indicates the * amount of space the caller allocated for this function to use. */ int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) { struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; int count, error, i; for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (count >= max_pcbs) { SCH_UNLOCK(sch); goto exit; } if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) continue; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof(xt); if (sc->sc_inc.inc_flags & INC_ISIPV6) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); xt.t_state = TCPS_SYN_RECEIVED; xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; xt.xt_inp.xi_socket.xso_len = sizeof (struct xsocket); xt.xt_inp.xi_socket.so_type = SOCK_STREAM; xt.xt_inp.xi_socket.so_state = SS_ISCONNECTING; error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { SCH_UNLOCK(sch); goto exit; } count++; } SCH_UNLOCK(sch); } exit: *pcbs_exported = count; return error; } Index: head/sys/netinet/tcp_timewait.c =================================================================== --- head/sys/netinet/tcp_timewait.c (revision 319721) +++ head/sys/netinet/tcp_timewait.c (revision 319722) @@ -1,741 +1,739 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #include #include static VNET_DEFINE(uma_zone_t, tcptw_zone); #define V_tcptw_zone VNET(tcptw_zone) static int maxtcptw; /* * The timed wait queue contains references to each of the TCP sessions * currently in the TIME_WAIT state. The queue pointers, including the * queue pointers in each tcptw structure, are protected using the global * timewait lock, which must be held over queue iteration and modification. * * Rules on tcptw usage: * - a inpcb is always freed _after_ its tcptw * - a tcptw relies on its inpcb reference counting for memory stability * - a tcptw is dereferenceable only while its inpcb is locked */ static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl); #define V_twq_2msl VNET(twq_2msl) /* Global timewait lock */ static VNET_DEFINE(struct rwlock, tw_lock); #define V_tw_lock VNET(tw_lock) #define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0) #define TW_LOCK_DESTROY(tw) rw_destroy(&(tw)) #define TW_RLOCK(tw) rw_rlock(&(tw)) #define TW_WLOCK(tw) rw_wlock(&(tw)) #define TW_RUNLOCK(tw) rw_runlock(&(tw)) #define TW_WUNLOCK(tw) rw_wunlock(&(tw)) #define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED) #define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED) #define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED) #define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED) static void tcp_tw_2msl_reset(struct tcptw *, int); static void tcp_tw_2msl_stop(struct tcptw *, int); static int tcp_twrespond(struct tcptw *, int); static int tcptw_auto_size(void) { int halfrange; /* * Max out at half the ephemeral port range so that TIME_WAIT * sockets don't tie up too many ephemeral ports. */ if (V_ipport_lastauto > V_ipport_firstauto) halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2; else halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2; /* Protect against goofy port ranges smaller than 32. */ return (imin(imax(halfrange, 32), maxsockets / 5)); } static int sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) { int error, new; if (maxtcptw == 0) new = tcptw_auto_size(); else new = maxtcptw; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) if (new >= 32) { maxtcptw = new; uma_zone_set_max(V_tcptw_zone, maxtcptw); } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, &maxtcptw, 0, sysctl_maxtcptw, "IU", "Maximum number of compressed TCP TIME_WAIT entries"); VNET_DEFINE(int, nolocaltimewait) = 0; #define V_nolocaltimewait VNET(nolocaltimewait) SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), 0, "Do not create compressed TCP TIME_WAIT entries for local connections"); void tcp_tw_zone_change(void) { if (maxtcptw == 0) uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); } void tcp_tw_init(void) { V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); else uma_zone_set_max(V_tcptw_zone, maxtcptw); TAILQ_INIT(&V_twq_2msl); TW_LOCK_INIT(V_tw_lock, "tcptw"); } #ifdef VIMAGE void tcp_tw_destroy(void) { struct tcptw *tw; INP_INFO_RLOCK(&V_tcbinfo); while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) tcp_twclose(tw, 0); INP_INFO_RUNLOCK(&V_tcbinfo); TW_LOCK_DESTROY(V_tw_lock); uma_zdestroy(V_tcptw_zone); } #endif /* * Move a TCP connection into TIME_WAIT state. * tcbinfo is locked. * inp is locked, and is unlocked before returning. */ void tcp_twstart(struct tcpcb *tp) { struct tcptw *tw; struct inpcb *inp = tp->t_inpcb; int acknow; struct socket *so; #ifdef INET6 int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* A dropped inp should never transition to TIME_WAIT state. */ KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("tcp_twstart: " "(inp->inp_flags & INP_DROPPED) != 0")); if (V_nolocaltimewait) { int error = 0; #ifdef INET6 if (isipv6) error = in6_localaddr(&inp->in6p_faddr); #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET error = in_localip(inp->inp_faddr); #endif if (error) { tp = tcp_close(tp); if (tp != NULL) INP_WUNLOCK(inp); return; } } /* * For use only by DTrace. We do not reference the state * after this point so modifying it in place is not a problem. */ tcp_state_change(tp, TCPS_TIME_WAIT); tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); if (tw == NULL) { /* * Reached limit on total number of TIMEWAIT connections * allowed. Remove a connection from TIMEWAIT queue in LRU * fashion to make room for this connection. * * XXX: Check if it possible to always have enough room * in advance based on guarantees provided by uma_zalloc(). */ tw = tcp_tw_2msl_scan(1); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) INP_WUNLOCK(inp); return; } } /* * The tcptw will hold a reference on its inpcb until tcp_twclose * is called */ tw->tw_inpcb = inp; in_pcbref(inp); /* Reference from tw */ /* * Recover last window size sent. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; else tw->last_win = 0; /* * Set t_recent if timestamps are used on the connection. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { tw->t_recent = tp->ts_recent; tw->ts_offset = tp->ts_offset; } else { tw->t_recent = 0; tw->ts_offset = 0; } tw->snd_nxt = tp->snd_nxt; tw->rcv_nxt = tp->rcv_nxt; tw->iss = tp->iss; tw->irs = tp->irs; tw->t_starttime = tp->t_starttime; tw->tw_time = 0; /* XXX * If this code will * be used for fin-wait-2 state also, then we may need * a ts_recent from the last segment. */ acknow = tp->t_flags & TF_ACKNOW; /* * First, discard tcpcb state, which includes stopping its timers and * freeing it. tcp_discardcb() used to also release the inpcb, but * that work is now done in the caller. * * Note: soisdisconnected() call used to be made in tcp_discardcb(), * and might not be needed here any longer. */ tcp_discardcb(tp); so = inp->inp_socket; soisdisconnected(so); tw->tw_cred = crhold(so->so_cred); SOCK_LOCK(so); tw->tw_so_options = so->so_options; SOCK_UNLOCK(so); if (acknow) tcp_twrespond(tw, TH_ACK); inp->inp_ppcb = tw; inp->inp_flags |= INP_TIMEWAIT; TCPSTATES_INC(TCPS_TIME_WAIT); tcp_tw_2msl_reset(tw, 0); /* * If the inpcb owns the sole reference to the socket, then we can * detach and free the socket as it is not needed in time wait. */ if (inp->inp_flags & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_twstart: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); } else INP_WUNLOCK(inp); } /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ int tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th, struct mbuf *m, int tlen) { struct tcptw *tw; int thflags; tcp_seq seq; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* * XXXRW: Time wait state for inpcb has been recycled, but inpcb is * still present. This is undesirable, but temporarily necessary * until we work out how to handle inpcb's who's timewait state has * been removed. */ tw = intotw(inp); if (tw == NULL) goto drop; thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; #if 0 /* PAWS not needed at the moment */ /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { if ((thflags & TH_ACK) == 0) goto drop; goto ack; } /* * ts_recent is never updated because we never accept new segments. */ #endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { tcp_twclose(tw, 0); return (1); } /* * Drop the segment if it does not contain an ACK. */ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_tw_2msl_reset(tw, 1); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); drop: INP_WUNLOCK(inp); m_freem(m); return (0); } void tcp_twclose(struct tcptw *tw, int reuse) { struct socket *so; struct inpcb *inp; /* * At this point, we are in one of two situations: * * (1) We have no socket, just an inpcb<->twtcp pair. We can free * all state. * * (2) We have a socket -- if we own a reference, release it and * notify the socket layer. */ inp = tw->tw_inpcb; KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */ INP_WLOCK_ASSERT(inp); tcp_tw_2msl_stop(tw, reuse); inp->inp_ppcb = NULL; in_pcbdrop(inp); so = inp->inp_socket; if (so != NULL) { /* * If there's a socket, handle two cases: first, we own a * strong reference, which we will now release, or we don't * in which case another reference exists (XXXRW: think * about this more), and we don't need to take action. */ if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); so->so_state &= ~SS_PROTOREF; sofree(so); } else { /* * If we don't own the only reference, the socket and * inpcb need to be left around to be handled by * tcp_usr_detach() later. */ INP_WUNLOCK(inp); } } else { /* * The socket has been already cleaned-up for us, only free the * inpcb. */ in_pcbfree(inp); } TCPSTAT_INC(tcps_closed); } static int tcp_twrespond(struct tcptw *tw, int flags) { struct inpcb *inp = tw->tw_inpcb; #if defined(INET6) || defined(INET) struct tcphdr *th = NULL; #endif struct mbuf *m; #ifdef INET struct ip *ip = NULL; #endif u_int hdrlen, optlen; int error = 0; /* Keep compiler happy */ struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif hdrlen = 0; /* Keep compiler happy */ INP_WLOCK_ASSERT(inp); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { hdrlen = sizeof(struct tcpiphdr); ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } #endif to.to_flags = 0; /* * Send a timestamp and echo-reply if both our side and our peer * have sent timestamps in our SYN's and this is not a RST. */ if (tw->t_recent && flags == TH_ACK) { to.to_flags |= TOF_TS; to.to_tsval = tcp_ts_getticks() + tw->ts_offset; to.to_tsecr = tw->t_recent; } optlen = tcp_addoptions(&to, (u_char *)(th + 1)); m->m_len = hdrlen + optlen; m->m_pkthdr.len = m->m_len; KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); th->th_seq = htonl(tw->snd_nxt); th->th_ack = htonl(tw->rcv_nxt); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; th->th_flags = flags; th->th_win = htons(tw->last_win); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(inp, NULL); error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); ip->ip_len = htons(m->m_pkthdr.len); if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, inp); } #endif if (flags & TH_ACK) TCPSTAT_INC(tcps_sndacks); else TCPSTAT_INC(tcps_sndctrl); TCPSTAT_INC(tcps_sndtotal); return (error); } static void tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tw->tw_inpcb); TW_WLOCK(V_tw_lock); if (rearm) TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); tw->tw_time = ticks + 2 * tcp_msl; TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); TW_WUNLOCK(V_tw_lock); } static void tcp_tw_2msl_stop(struct tcptw *tw, int reuse) { struct ucred *cred; struct inpcb *inp; int released; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TW_WLOCK(V_tw_lock); inp = tw->tw_inpcb; tw->tw_inpcb = NULL; TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); cred = tw->tw_cred; tw->tw_cred = NULL; TW_WUNLOCK(V_tw_lock); if (cred != NULL) crfree(cred); released = in_pcbrele_wlocked(inp); KASSERT(!released, ("%s: inp should not be released here", __func__)); if (!reuse) uma_zfree(V_tcptw_zone, tw); TCPSTATES_DEC(TCPS_TIME_WAIT); } struct tcptw * tcp_tw_2msl_scan(int reuse) { struct tcptw *tw; struct inpcb *inp; #ifdef INVARIANTS if (reuse) { /* * Exclusive pcbinfo lock is not required in reuse case even if * two inpcb locks can be acquired simultaneously: * - the inpcb transitioning to TIME_WAIT state in * tcp_tw_start(), * - the inpcb closed by tcp_twclose(). * * It is because only inpcbs in FIN_WAIT2 or CLOSING states can * transition in TIME_WAIT state. Then a pcbcb cannot be in * TIME_WAIT list and transitioning to TIME_WAIT state at same * time. */ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } #endif for (;;) { TW_RLOCK(V_tw_lock); tw = TAILQ_FIRST(&V_twq_2msl); if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) { TW_RUNLOCK(V_tw_lock); break; } KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL", __func__)); inp = tw->tw_inpcb; in_pcbref(inp); TW_RUNLOCK(V_tw_lock); if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) { INP_WLOCK(inp); tw = intotw(inp); if (in_pcbrele_wlocked(inp)) { KASSERT(tw == NULL, ("%s: held last inp " "reference but tw not NULL", __func__)); INP_INFO_RUNLOCK(&V_tcbinfo); continue; } if (tw == NULL) { /* tcp_twclose() has already been called */ INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); continue; } tcp_twclose(tw, reuse); INP_INFO_RUNLOCK(&V_tcbinfo); if (reuse) return tw; } else { /* INP_INFO lock is busy, continue later. */ INP_WLOCK(inp); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); break; } } return NULL; } Index: head/sys/ofed/drivers/infiniband/core/iwcm.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/iwcm.c (revision 319721) +++ head/sys/ofed/drivers/infiniband/core/iwcm.c (revision 319722) @@ -1,1316 +1,1285 @@ /* * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iwcm.h" MODULE_AUTHOR("Tom Tucker"); MODULE_DESCRIPTION("iWARP CM"); MODULE_LICENSE("Dual BSD/GPL"); static struct workqueue_struct *iwcm_wq; struct iwcm_work { struct work_struct work; struct iwcm_id_private *cm_id; struct list_head list; struct iw_cm_event event; struct list_head free_list; }; struct iwcm_listen_work { struct work_struct work; struct iw_cm_id *cm_id; }; static LIST_HEAD(listen_port_list); static DEFINE_MUTEX(listen_port_mutex); struct listen_port_info { struct list_head list; uint16_t port_num; uint32_t refcnt; }; static int32_t add_port_to_listenlist(uint16_t port) { struct listen_port_info *port_info; int err = 0; mutex_lock(&listen_port_mutex); list_for_each_entry(port_info, &listen_port_list, list) if (port_info->port_num == port) goto found_port; port_info = kmalloc(sizeof(*port_info), GFP_KERNEL); if (!port_info) { err = -ENOMEM; mutex_unlock(&listen_port_mutex); goto out; } port_info->port_num = port; port_info->refcnt = 0; list_add(&port_info->list, &listen_port_list); found_port: ++(port_info->refcnt); mutex_unlock(&listen_port_mutex); return port_info->refcnt; out: return err; } static int32_t rem_port_from_listenlist(uint16_t port) { struct listen_port_info *port_info; int ret, found_port = 0; mutex_lock(&listen_port_mutex); list_for_each_entry(port_info, &listen_port_list, list) if (port_info->port_num == port) { found_port = 1; break; } if (found_port) { --(port_info->refcnt); ret = port_info->refcnt; if (port_info->refcnt == 0) { /* Remove this entry from the list as there are no * more listeners for this port_num. */ list_del(&port_info->list); kfree(port_info); } } else { ret = -EINVAL; } mutex_unlock(&listen_port_mutex); return ret; } /* * The following services provide a mechanism for pre-allocating iwcm_work * elements. The design pre-allocates them based on the cm_id type: * LISTENING IDS: Get enough elements preallocated to handle the * listen backlog. * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE * * Allocating them in connect and listen avoids having to deal * with allocation failures on the event upcall from the provider (which * is called in the interrupt context). * * One exception is when creating the cm_id for incoming connection requests. * There are two cases: * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If * the backlog is exceeded, then no more connection request events will * be processed. cm_event_handler() returns -ENOMEM in this case. Its up * to the provider to reject the connection request. * 2) in the connection request workqueue handler, cm_conn_req_handler(). * If work elements cannot be allocated for the new connect request cm_id, * then IWCM will call the provider reject method. This is ok since * cm_conn_req_handler() runs in the workqueue thread context. */ static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) { struct iwcm_work *work; if (list_empty(&cm_id_priv->work_free_list)) return NULL; work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work, free_list); list_del_init(&work->free_list); return work; } static void put_work(struct iwcm_work *work) { list_add(&work->free_list, &work->cm_id->work_free_list); } static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv) { struct list_head *e, *tmp; list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) kfree(list_entry(e, struct iwcm_work, free_list)); } static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) { struct iwcm_work *work; BUG_ON(!list_empty(&cm_id_priv->work_free_list)); while (count--) { work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL); if (!work) { dealloc_work_entries(cm_id_priv); return -ENOMEM; } work->cm_id = cm_id_priv; INIT_LIST_HEAD(&work->list); put_work(work); } return 0; } /* * Save private data from incoming connection requests to * iw_cm_event, so the low level driver doesn't have to. Adjust * the event ptr to point to the local copy. */ static int copy_private_data(struct iw_cm_event *event) { void *p; p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC); if (!p) return -ENOMEM; event->private_data = p; return 0; } static void free_cm_id(struct iwcm_id_private *cm_id_priv) { dealloc_work_entries(cm_id_priv); kfree(cm_id_priv); } /* * Release a reference on cm_id. If the last reference is being * released, enable the waiting thread (in iw_destroy_cm_id) to * get woken up, and return 1 if a thread is already waiting. */ static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { BUG_ON(atomic_read(&cm_id_priv->refcount)==0); if (atomic_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); complete(&cm_id_priv->destroy_comp); return 1; } return 0; } static void add_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); atomic_inc(&cm_id_priv->refcount); } static void rem_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; int cb_destroy; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* * Test bit before deref in case the cm_id gets freed on another * thread. */ cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); if (iwcm_deref_id(cm_id_priv) && cb_destroy) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); } } static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); struct iw_cm_id *iw_create_cm_id(struct ib_device *device, struct socket *so, iw_cm_handler cm_handler, void *context) { struct iwcm_id_private *cm_id_priv; cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL); if (!cm_id_priv) return ERR_PTR(-ENOMEM); cm_id_priv->state = IW_CM_STATE_IDLE; cm_id_priv->id.device = device; cm_id_priv->id.cm_handler = cm_handler; cm_id_priv->id.context = context; cm_id_priv->id.event_handler = cm_event_handler; cm_id_priv->id.add_ref = add_ref; cm_id_priv->id.rem_ref = rem_ref; cm_id_priv->id.so = so; spin_lock_init(&cm_id_priv->lock); atomic_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); init_completion(&cm_id_priv->destroy_comp); INIT_LIST_HEAD(&cm_id_priv->work_list); INIT_LIST_HEAD(&cm_id_priv->work_free_list); return &cm_id_priv->id; } EXPORT_SYMBOL(iw_create_cm_id); static int iwcm_modify_qp_err(struct ib_qp *qp) { struct ib_qp_attr qp_attr; if (!qp) return -EINVAL; qp_attr.qp_state = IB_QPS_ERR; return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); } /* * This is really the RDMAC CLOSING state. It is most similar to the * IB SQD QP state. */ static int iwcm_modify_qp_sqd(struct ib_qp *qp) { struct ib_qp_attr qp_attr; BUG_ON(qp == NULL); qp_attr.qp_state = IB_QPS_SQD; return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); } /* * CM_ID <-- CLOSING * * Block if a passive or active connection is currently being processed. Then * process the event as follows: * - If we are ESTABLISHED, move to CLOSING and modify the QP state * based on the abrupt flag * - If the connection is already in the CLOSING or IDLE state, the peer is * disconnecting concurrently with us and we've already seen the * DISCONNECT event -- ignore the request and return 0 * - Disconnect on a listening endpoint returns -EINVAL */ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret = 0; struct ib_qp *qp = NULL; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* Wait if we're currently in a connect or accept downcall */ wait_event(cm_id_priv->connect_wait, !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_ESTABLISHED: cm_id_priv->state = IW_CM_STATE_CLOSING; /* QP could be for user-mode client */ if (cm_id_priv->qp) qp = cm_id_priv->qp; else ret = -EINVAL; break; case IW_CM_STATE_LISTEN: ret = -EINVAL; break; case IW_CM_STATE_CLOSING: /* remote peer closed first */ case IW_CM_STATE_IDLE: /* accept or connect returned !0 */ break; case IW_CM_STATE_CONN_RECV: /* * App called disconnect before/without calling accept after * connect_request event delivered. */ break; case IW_CM_STATE_CONN_SENT: /* Can only get here if wait above fails */ default: BUG(); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) { if (abrupt) ret = iwcm_modify_qp_err(qp); else ret = iwcm_modify_qp_sqd(qp); /* * If both sides are disconnecting the QP could * already be in ERR or SQD states */ ret = 0; } return ret; } EXPORT_SYMBOL(iw_cm_disconnect); static struct socket * dequeue_socket(struct socket *head) { struct socket *so; struct sockaddr_in *remote; + int error; - ACCEPT_LOCK(); - so = TAILQ_FIRST(&head->so_comp); - if (!so) { - ACCEPT_UNLOCK(); - return NULL; - } - - SOCK_LOCK(so); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - soref(so); - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(head); + error = solisten_dequeue(head, &so, SOCK_NONBLOCK); + if (error == EWOULDBLOCK) + return (NULL); remote = NULL; soaccept(so, (struct sockaddr **)&remote); free(remote, M_SONAME); return so; } + static void iw_so_event_handler(struct work_struct *_work) { #ifdef INET struct iwcm_listen_work *work = container_of(_work, struct iwcm_listen_work, work); struct iw_cm_id *listen_cm_id = work->cm_id; struct iwcm_id_private *cm_id_priv; struct iw_cm_id *real_cm_id; struct sockaddr_in *local; struct socket *so; cm_id_priv = container_of(listen_cm_id, struct iwcm_id_private, id); if (cm_id_priv->state != IW_CM_STATE_LISTEN) { kfree(work); return; } /* Dequeue & process all new 'so' connection requests for this cmid */ while ((so = dequeue_socket(work->cm_id->so)) != NULL) { if (rdma_cma_any_addr((struct sockaddr *) &listen_cm_id->local_addr)) { in_getsockaddr(so, (struct sockaddr **)&local); if (rdma_find_cmid_laddr(local, ARPHRD_ETHER, (void **) &real_cm_id)) { free(local, M_SONAME); goto err; } free(local, M_SONAME); real_cm_id->device->iwcm->newconn(real_cm_id, so); } else { listen_cm_id->device->iwcm->newconn(listen_cm_id, so); } } err: kfree(work); #endif return; } + static int iw_so_upcall(struct socket *parent_so, void *arg, int waitflag) { struct iwcm_listen_work *work; - struct socket *so; struct iw_cm_id *cm_id = arg; /* check whether iw_so_event_handler() already dequeued this 'so' */ - so = TAILQ_FIRST(&parent_so->so_comp); - if (!so) + if (TAILQ_EMPTY(&parent_so->sol_comp)) return SU_OK; - work = kzalloc(sizeof(*work), M_NOWAIT); + work = kzalloc(sizeof(*work), waitflag); if (!work) return -ENOMEM; work->cm_id = cm_id; INIT_WORK(&work->work, iw_so_event_handler); queue_work(iwcm_wq, &work->work); return SU_OK; } -static void -iw_init_sock(struct iw_cm_id *cm_id) +static int +iw_create_listen(struct iw_cm_id *cm_id, int backlog) { struct sockopt sopt; struct socket *so = cm_id->so; int on = 1; + int rc; - SOCK_LOCK(so); - soupcall_set(so, SO_RCV, iw_so_upcall, cm_id); + rc = -solisten(cm_id->so, backlog, curthread); + if (rc != 0) + return (rc); + SOLISTEN_LOCK(so); + solisten_upcall_set(so, iw_so_upcall, cm_id); so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); + SOLISTEN_UNLOCK(so); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = (caddr_t)&on; sopt.sopt_valsize = sizeof(on); sopt.sopt_td = NULL; sosetopt(so, &sopt); -} - -static int -iw_uninit_socket(struct iw_cm_id *cm_id) -{ - struct socket *so = cm_id->so; - - SOCK_LOCK(so); - soupcall_clear(so, SO_RCV); - SOCK_UNLOCK(so); - return (0); } static int -iw_create_listen(struct iw_cm_id *cm_id, int backlog) -{ - int rc; - - iw_init_sock(cm_id); - rc = -solisten(cm_id->so, backlog, curthread); - if (rc != 0) - iw_uninit_socket(cm_id); - return (rc); -} - -static int iw_destroy_listen(struct iw_cm_id *cm_id) { + struct socket *so = cm_id->so; - return (iw_uninit_socket(cm_id)); + SOLISTEN_LOCK(so); + solisten_upcall_set(so, NULL, NULL); + SOLISTEN_UNLOCK(so); + return (0); } /* * CM_ID <-- DESTROYING * * Clean up all resources associated with the connection and release * the initial reference taken by iw_create_cm_id. */ static void destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret = 0, refcnt; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* * Wait if we're currently in a connect or accept downcall. A * listening endpoint should never block here. */ wait_event(cm_id_priv->connect_wait, !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_LISTEN: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) { refcnt = rem_port_from_listenlist(cm_id->local_addr.sin_port); if (refcnt == 0) ret = iw_destroy_listen(cm_id); cm_id->device->iwcm->destroy_listen_ep(cm_id); } else { ret = iw_destroy_listen(cm_id); cm_id->device->iwcm->destroy_listen_ep(cm_id); } spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_ESTABLISHED: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); /* Abrupt close of the connection */ (void)iwcm_modify_qp_err(cm_id_priv->qp); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_IDLE: case IW_CM_STATE_CLOSING: cm_id_priv->state = IW_CM_STATE_DESTROYING; break; case IW_CM_STATE_CONN_RECV: /* * App called destroy before/without calling accept after * receiving connection request event notification or * returned non zero from the event callback function. * In either case, must tell the provider to reject. */ cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_id->device->iwcm->reject(cm_id, NULL, 0); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_DESTROYING: default: BUG(); break; } if (cm_id_priv->qp) { cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); (void)iwcm_deref_id(cm_id_priv); } /* * This function is only called by the application thread and cannot * be called by the event thread. The function will wait for all * references to be released on the cm_id and then kfree the cm_id * object. */ void iw_destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)); destroy_cm_id(cm_id); wait_for_completion(&cm_id_priv->destroy_comp); if (cm_id->so) sock_release(cm_id->so); free_cm_id(cm_id_priv); } EXPORT_SYMBOL(iw_destroy_cm_id); /* * CM_ID <-- LISTEN * * Start listening for connect requests. Generates one CONNECT_REQUEST * event for each inbound connect request. */ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret, refcnt; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); ret = alloc_work_entries(cm_id_priv, backlog); if (ret) return ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: cm_id_priv->state = IW_CM_STATE_LISTEN; spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) { refcnt = add_port_to_listenlist(cm_id->local_addr.sin_port); if (refcnt == 1) { ret = iw_create_listen(cm_id, backlog); } else if (refcnt <= 0) { ret = -EINVAL; } else { /* if refcnt > 1, a socket listener created * already. And we need not create socket * listener on other rdma devices/listen cm_id's * due to TOE. That is when a socket listener is * created with INADDR_ANY all registered TOE * devices will get a call to start * hardware listeners. */ } } else { ret = iw_create_listen(cm_id, backlog); } if (!ret) cm_id->device->iwcm->create_listen_ep(cm_id, backlog); else cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); break; default: ret = -EINVAL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(iw_cm_listen); /* * CM_ID <-- IDLE * * Rejects an inbound connection request. No events are generated. */ int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id_priv->state = IW_CM_STATE_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->iwcm->reject(cm_id, private_data, private_data_len); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return ret; } EXPORT_SYMBOL(iw_cm_reject); /* * CM_ID <-- ESTABLISHED * * Accepts an inbound connection request and generates an ESTABLISHED * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block * until the ESTABLISHED event is received from the provider. */ int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) { struct iwcm_id_private *cm_id_priv; struct ib_qp *qp; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } /* Get the ib_qp given the QPN */ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); cm_id_priv->qp = qp; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->iwcm->accept(cm_id, iw_param); if (ret) { /* An error on accept precludes provider events */ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { cm_id->device->iwcm->rem_ref(qp); cm_id_priv->qp = NULL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); } return ret; } EXPORT_SYMBOL(iw_cm_accept); /* * Active Side: CM_ID <-- CONN_SENT * * If successful, results in the generation of a CONNECT_REPLY * event. iw_cm_disconnect and iw_cm_destroy will block until the * CONNECT_REPLY event is received from the provider. */ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) { struct iwcm_id_private *cm_id_priv; int ret; unsigned long flags; struct ib_qp *qp; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); ret = alloc_work_entries(cm_id_priv, 4); if (ret) return ret; set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_IDLE) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } /* Get the ib_qp given the QPN */ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); cm_id_priv->qp = qp; cm_id_priv->state = IW_CM_STATE_CONN_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->iwcm->connect(cm_id, iw_param); if (ret) { spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { cm_id->device->iwcm->rem_ref(qp); cm_id_priv->qp = NULL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); cm_id_priv->state = IW_CM_STATE_IDLE; clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); } return ret; } EXPORT_SYMBOL(iw_cm_connect); /* * Passive Side: new CM_ID <-- CONN_RECV * * Handles an inbound connect request. The function creates a new * iw_cm_id to represent the new connection and inherits the client * callback function and other attributes from the listening parent. * * The work item contains a pointer to the listen_cm_id and the event. The * listen_cm_id contains the client cm_handler, context and * device. These are copied when the device is cloned. The event * contains the new four tuple. * * An error on the child should not affect the parent, so this * function does not return a value. */ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; struct iw_cm_id *cm_id; struct iwcm_id_private *cm_id_priv; int ret; /* * The provider should never generate a connection request * event with a bad status. */ BUG_ON(iw_event->status); cm_id = iw_create_cm_id(listen_id_priv->id.device, iw_event->so, listen_id_priv->id.cm_handler, listen_id_priv->id.context); /* If the cm_id could not be created, ignore the request */ if (IS_ERR(cm_id)) goto out; cm_id->provider_data = iw_event->provider_data; cm_id->local_addr = iw_event->local_addr; cm_id->remote_addr = iw_event->remote_addr; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); cm_id_priv->state = IW_CM_STATE_CONN_RECV; /* * We could be destroying the listening id. If so, ignore this * upcall. */ spin_lock_irqsave(&listen_id_priv->lock, flags); if (listen_id_priv->state != IW_CM_STATE_LISTEN) { spin_unlock_irqrestore(&listen_id_priv->lock, flags); iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); goto out; } spin_unlock_irqrestore(&listen_id_priv->lock, flags); ret = alloc_work_entries(cm_id_priv, 3); if (ret) { iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); goto out; } /* Call the client CM handler */ ret = cm_id->cm_handler(cm_id, iw_event); if (ret) { iw_cm_reject(cm_id, NULL, 0); set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); destroy_cm_id(cm_id); if (atomic_read(&cm_id_priv->refcount)==0) free_cm_id(cm_id_priv); } out: if (iw_event->private_data_len) kfree(iw_event->private_data); } /* * Passive Side: CM_ID <-- ESTABLISHED * * The provider generated an ESTABLISHED event which means that * the MPA negotion has completed successfully and we are now in MPA * FPDU mode. * * This event can only be received in the CONN_RECV state. If the * remote peer closed, the ESTABLISHED event would be received followed * by the CLOSE event. If the app closes, it will block until we wake * it up after processing this event. */ static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); /* * We clear the CONNECT_WAIT bit here to allow the callback * function to call iw_cm_disconnect. Calling iw_destroy_cm_id * from a callback handler is not allowed. */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_ESTABLISHED; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); wake_up_all(&cm_id_priv->connect_wait); return ret; } /* * Active Side: CM_ID <-- ESTABLISHED * * The app has called connect and is waiting for the established event to * post it's requests to the server. This event will wake up anyone * blocked in iw_cm_disconnect or iw_destroy_id. */ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); /* * Clear the connect wait bit so a callback function calling * iw_cm_disconnect will not wait and deadlock this thread */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); if (iw_event->status == 0) { cm_id_priv->id.local_addr = iw_event->local_addr; cm_id_priv->id.remote_addr = iw_event->remote_addr; cm_id_priv->state = IW_CM_STATE_ESTABLISHED; } else { /* REJECTED or RESET */ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; cm_id_priv->state = IW_CM_STATE_IDLE; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); if (iw_event->private_data_len) kfree(iw_event->private_data); /* Wake up waiters on connect complete */ wake_up_all(&cm_id_priv->connect_wait); return ret; } /* * CM_ID <-- CLOSING * * If in the ESTABLISHED state, move to CLOSING. */ static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED) cm_id_priv->state = IW_CM_STATE_CLOSING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); } /* * CM_ID <-- IDLE * * If in the ESTBLISHED or CLOSING states, the QP will have have been * moved by the provider to the ERR state. Disassociate the CM_ID from * the QP, move to IDLE, and remove the 'connected' reference. * * If in some other state, the cm_id was destroyed asynchronously. * This is the last reference that will result in waking up * the app thread blocked in iw_destroy_cm_id. */ static int cm_close_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; } switch (cm_id_priv->state) { case IW_CM_STATE_ESTABLISHED: case IW_CM_STATE_CLOSING: cm_id_priv->state = IW_CM_STATE_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_DESTROYING: break; default: BUG(); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int process_event(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { int ret = 0; switch (iw_event->event) { case IW_CM_EVENT_CONNECT_REQUEST: cm_conn_req_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_CONNECT_REPLY: ret = cm_conn_rep_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_ESTABLISHED: ret = cm_conn_est_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_DISCONNECT: cm_disconnect_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_CLOSE: ret = cm_close_handler(cm_id_priv, iw_event); break; default: BUG(); } return ret; } /* * Process events on the work_list for the cm_id. If the callback * function requests that the cm_id be deleted, a flag is set in the * cm_id flags to indicate that when the last reference is * removed, the cm_id is to be destroyed. This is necessary to * distinguish between an object that will be destroyed by the app * thread asleep on the destroy_comp list vs. an object destroyed * here synchronously when the last reference is removed. */ static void cm_work_handler(struct work_struct *_work) { struct iwcm_work *work = container_of(_work, struct iwcm_work, work); struct iw_cm_event levent; struct iwcm_id_private *cm_id_priv = work->cm_id; unsigned long flags; int empty; int ret = 0; int destroy_id; spin_lock_irqsave(&cm_id_priv->lock, flags); empty = list_empty(&cm_id_priv->work_list); while (!empty) { work = list_entry(cm_id_priv->work_list.next, struct iwcm_work, list); list_del_init(&work->list); empty = list_empty(&cm_id_priv->work_list); levent = work->event; put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = process_event(cm_id_priv, &levent); if (ret) { set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); destroy_cm_id(&cm_id_priv->id); } BUG_ON(atomic_read(&cm_id_priv->refcount)==0); destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); if (iwcm_deref_id(cm_id_priv)) { if (destroy_id) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); } return; } if (empty) return; spin_lock_irqsave(&cm_id_priv->lock, flags); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); } /* * This function is called on interrupt context. Schedule events on * the iwcm_wq thread to allow callback functions to downcall into * the CM and/or block. Events are queued to a per-CM_ID * work_list. If this is the first event on the work_list, the work * element is also queued on the iwcm_wq thread. * * Each event holds a reference on the cm_id. Until the last posted * event has been delivered and processed, the cm_id cannot be * deleted. * * Returns: * 0 - the event was handled. * -ENOMEM - the event was not handled due to lack of resources. */ static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct iwcm_work *work; struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret = 0; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); work = get_work(cm_id_priv); if (!work) { ret = -ENOMEM; goto out; } INIT_WORK(&work->work, cm_work_handler); work->cm_id = cm_id_priv; work->event = *iw_event; if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST || work->event.event == IW_CM_EVENT_CONNECT_REPLY) && work->event.private_data_len) { ret = copy_private_data(&work->event); if (ret) { put_work(work); goto out; } } atomic_inc(&cm_id_priv->refcount); if (list_empty(&cm_id_priv->work_list)) { list_add_tail(&work->list, &cm_id_priv->work_list); queue_work(iwcm_wq, &work->work); } else list_add_tail(&work->list, &cm_id_priv->work_list); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE| IB_ACCESS_REMOTE_READ; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = 0; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct iwcm_id_private *cm_id_priv; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); switch (qp_attr->qp_state) { case IB_QPS_INIT: case IB_QPS_RTR: ret = iwcm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTS: ret = iwcm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: ret = -EINVAL; break; } return ret; } EXPORT_SYMBOL(iw_cm_init_qp_attr); static int __init iw_cm_init(void) { iwcm_wq = create_singlethread_workqueue("iw_cm_wq"); if (!iwcm_wq) return -ENOMEM; return 0; } static void __exit iw_cm_cleanup(void) { destroy_workqueue(iwcm_wq); } module_init(iw_cm_init); module_exit(iw_cm_cleanup); Index: head/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c (revision 319721) +++ head/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c (revision 319722) @@ -1,1973 +1,1972 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved. * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c */ /* * * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include "sdp.h" #include #include #include #include uma_zone_t sdp_zone; struct rwlock sdp_lock; LIST_HEAD(, sdp_sock) sdp_list; struct workqueue_struct *rx_comp_wq; RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock"); #define SDP_LIST_WLOCK() rw_wlock(&sdp_lock) #define SDP_LIST_RLOCK() rw_rlock(&sdp_lock) #define SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock) #define SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock) #define SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED) #define SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED) #define SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED) MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol"); static void sdp_stop_keepalive_timer(struct socket *so); /* * SDP protocol interface to socket abstraction. */ /* * sdp_sendspace and sdp_recvspace are the default send and receive window * sizes, respectively. */ u_long sdp_sendspace = 1024*32; u_long sdp_recvspace = 1024*64; static int sdp_count; /* * Disable async. CMA events for sockets which are being torn down. */ static void sdp_destroy_cma(struct sdp_sock *ssk) { if (ssk->id == NULL) return; rdma_destroy_id(ssk->id); ssk->id = NULL; } static int sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred) { struct sockaddr_in *sin; struct sockaddr_in null; int error; SDP_WLOCK_ASSERT(ssk); if (ssk->lport != 0 || ssk->laddr != INADDR_ANY) return (EINVAL); /* rdma_bind_addr handles bind races. */ SDP_WUNLOCK(ssk); if (ssk->id == NULL) ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC); if (ssk->id == NULL) { SDP_WLOCK(ssk); return (ENOMEM); } if (nam == NULL) { null.sin_family = AF_INET; null.sin_len = sizeof(null); null.sin_addr.s_addr = INADDR_ANY; null.sin_port = 0; bzero(&null.sin_zero, sizeof(null.sin_zero)); nam = (struct sockaddr *)&null; } error = -rdma_bind_addr(ssk->id, nam); SDP_WLOCK(ssk); if (error == 0) { sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr; ssk->laddr = sin->sin_addr.s_addr; ssk->lport = sin->sin_port; } else sdp_destroy_cma(ssk); return (error); } static void sdp_pcbfree(struct sdp_sock *ssk) { KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk)); KASSERT((ssk->flags & SDP_DESTROY) == 0, ("ssk %p already destroyed", ssk)); sdp_dbg(ssk->socket, "Freeing pcb"); SDP_WLOCK_ASSERT(ssk); ssk->flags |= SDP_DESTROY; SDP_WUNLOCK(ssk); SDP_LIST_WLOCK(); sdp_count--; LIST_REMOVE(ssk, list); SDP_LIST_WUNLOCK(); crfree(ssk->cred); ssk->qp_active = 0; if (ssk->qp) { ib_destroy_qp(ssk->qp); ssk->qp = NULL; } sdp_tx_ring_destroy(ssk); sdp_rx_ring_destroy(ssk); sdp_destroy_cma(ssk); rw_destroy(&ssk->rx_ring.destroyed_lock); rw_destroy(&ssk->lock); uma_zfree(sdp_zone, ssk); } /* * Common routines to return a socket address. */ static struct sockaddr * sdp_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } static int sdp_getsockaddr(struct socket *so, struct sockaddr **nam) { struct sdp_sock *ssk; struct in_addr addr; in_port_t port; ssk = sdp_sk(so); SDP_RLOCK(ssk); port = ssk->lport; addr.s_addr = ssk->laddr; SDP_RUNLOCK(ssk); *nam = sdp_sockaddr(port, &addr); return 0; } static int sdp_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct sdp_sock *ssk; struct in_addr addr; in_port_t port; ssk = sdp_sk(so); SDP_RLOCK(ssk); port = ssk->fport; addr.s_addr = ssk->faddr; SDP_RUNLOCK(ssk); *nam = sdp_sockaddr(port, &addr); return 0; } static void sdp_pcbnotifyall(struct in_addr faddr, int errno, struct sdp_sock *(*notify)(struct sdp_sock *, int)) { struct sdp_sock *ssk, *ssk_temp; SDP_LIST_WLOCK(); LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) { SDP_WLOCK(ssk); if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) { SDP_WUNLOCK(ssk); continue; } if ((ssk->flags & SDP_DESTROY) == 0) if ((*notify)(ssk, errno)) SDP_WUNLOCK(ssk); } SDP_LIST_WUNLOCK(); } #if 0 static void sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg) { struct sdp_sock *ssk; SDP_LIST_RLOCK(); LIST_FOREACH(ssk, &sdp_list, list) { SDP_WLOCK(ssk); func(ssk, arg); SDP_WUNLOCK(ssk); } SDP_LIST_RUNLOCK(); } #endif static void sdp_output_reset(struct sdp_sock *ssk) { struct rdma_cm_id *id; SDP_WLOCK_ASSERT(ssk); if (ssk->id) { id = ssk->id; ssk->qp_active = 0; SDP_WUNLOCK(ssk); rdma_disconnect(id); SDP_WLOCK(ssk); } ssk->state = TCPS_CLOSED; } /* * Attempt to close a SDP socket, marking it as dropped, and freeing * the socket if we hold the only reference. */ static struct sdp_sock * sdp_closed(struct sdp_sock *ssk) { struct socket *so; SDP_WLOCK_ASSERT(ssk); ssk->flags |= SDP_DROPPED; so = ssk->socket; soisdisconnected(so); if (ssk->flags & SDP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("sdp_closed: !SS_PROTOREF")); ssk->flags &= ~SDP_SOCKREF; SDP_WUNLOCK(ssk); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (ssk); } /* * Perform timer based shutdowns which can not operate in * callout context. */ static void sdp_shutdown_task(void *data, int pending) { struct sdp_sock *ssk; ssk = data; SDP_WLOCK(ssk); /* * I don't think this can race with another call to pcbfree() * because SDP_TIMEWAIT protects it. SDP_DESTROY may be redundant. */ if (ssk->flags & SDP_DESTROY) panic("sdp_shutdown_task: Racing with pcbfree for ssk %p", ssk); if (ssk->flags & SDP_DISCON) sdp_output_reset(ssk); /* We have to clear this so sdp_detach() will call pcbfree(). */ ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT); if ((ssk->flags & SDP_DROPPED) == 0 && sdp_closed(ssk) == NULL) return; if (ssk->socket == NULL) { sdp_pcbfree(ssk); return; } SDP_WUNLOCK(ssk); } /* * 2msl has expired, schedule the shutdown task. */ static void sdp_2msl_timeout(void *data) { struct sdp_sock *ssk; ssk = data; /* Callout canceled. */ if (!callout_active(&ssk->keep2msl)) goto out; callout_deactivate(&ssk->keep2msl); /* Should be impossible, defensive programming. */ if ((ssk->flags & SDP_TIMEWAIT) == 0) goto out; taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task); out: SDP_WUNLOCK(ssk); return; } /* * Schedule the 2msl wait timer. */ static void sdp_2msl_wait(struct sdp_sock *ssk) { SDP_WLOCK_ASSERT(ssk); ssk->flags |= SDP_TIMEWAIT; ssk->state = TCPS_TIME_WAIT; soisdisconnected(ssk->socket); callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk); } /* * Timed out waiting for the final fin/ack from rdma_disconnect(). */ static void sdp_dreq_timeout(void *data) { struct sdp_sock *ssk; ssk = data; /* Callout canceled. */ if (!callout_active(&ssk->keep2msl)) goto out; /* Callout rescheduled, probably as a different timer. */ if (callout_pending(&ssk->keep2msl)) goto out; callout_deactivate(&ssk->keep2msl); if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK) goto out; if ((ssk->flags & SDP_DREQWAIT) == 0) goto out; ssk->flags &= ~SDP_DREQWAIT; ssk->flags |= SDP_DISCON; sdp_2msl_wait(ssk); ssk->qp_active = 0; out: SDP_WUNLOCK(ssk); } /* * Received the final fin/ack. Cancel the 2msl. */ void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk) { sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n"); ssk->flags &= ~SDP_DREQWAIT; sdp_2msl_wait(ssk); } static int sdp_init_sock(struct socket *sk) { struct sdp_sock *ssk = sdp_sk(sk); sdp_dbg(sk, "%s\n", __func__); callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED); TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk); #ifdef SDP_ZCOPY INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout); ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */ ssk->tx_ring.rdma_inflight = NULL; #endif atomic_set(&ssk->mseq_ack, 0); sdp_rx_ring_init(ssk); ssk->tx_ring.buffer = NULL; return 0; } /* * Allocate an sdp_sock for the socket and reserve socket buffer space. */ static int sdp_attach(struct socket *so, int proto, struct thread *td) { struct sdp_sock *ssk; int error; ssk = sdp_sk(so); KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so)); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, sdp_sendspace, sdp_recvspace); if (error) return (error); } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO); if (ssk == NULL) return (ENOBUFS); rw_init(&ssk->lock, "sdpsock"); ssk->socket = so; ssk->cred = crhold(so->so_cred); so->so_pcb = (caddr_t)ssk; sdp_init_sock(so); ssk->flags = 0; ssk->qp_active = 0; ssk->state = TCPS_CLOSED; mbufq_init(&ssk->rxctlq, INT_MAX); SDP_LIST_WLOCK(); LIST_INSERT_HEAD(&sdp_list, ssk, list); sdp_count++; SDP_LIST_WUNLOCK(); if ((so->so_options & SO_LINGER) && so->so_linger == 0) so->so_linger = TCP_LINGERTIME; return (0); } /* * Detach SDP from the socket, potentially leaving it around for the * timewait to expire. */ static void sdp_detach(struct socket *so) { struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL")); ssk->socket->so_pcb = NULL; ssk->socket = NULL; if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT)) SDP_WUNLOCK(ssk); else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT) sdp_pcbfree(ssk); else panic("sdp_detach: Unexpected state, ssk %p.\n", ssk); } /* * Allocate a local address for the socket. */ static int sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct sdp_sock *ssk; struct sockaddr_in *sin; sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EINVAL); if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return (EAFNOSUPPORT); ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = EINVAL; goto out; } error = sdp_pcbbind(ssk, nam, td->td_ucred); out: SDP_WUNLOCK(ssk); return (error); } /* * Prepare to accept connections. */ static int sdp_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = EINVAL; goto out; } if (error == 0 && ssk->lport == 0) error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); SOCK_LOCK(so); if (error == 0) error = solisten_proto_check(so); if (error == 0) { solisten_proto(so, backlog); ssk->state = TCPS_LISTEN; } SOCK_UNLOCK(so); out: SDP_WUNLOCK(ssk); if (error == 0) error = -rdma_listen(ssk->id, backlog); return (error); } /* * Initiate a SDP connection to nam. */ static int sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td) { struct sockaddr_in src; struct socket *so; int error; so = ssk->socket; SDP_WLOCK_ASSERT(ssk); if (ssk->lport == 0) { error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); if (error) return error; } src.sin_family = AF_INET; src.sin_len = sizeof(src); bzero(&src.sin_zero, sizeof(src.sin_zero)); src.sin_port = ssk->lport; src.sin_addr.s_addr = ssk->laddr; soisconnecting(so); SDP_WUNLOCK(ssk); error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam, SDP_RESOLVE_TIMEOUT); SDP_WLOCK(ssk); if (error == 0) ssk->state = TCPS_SYN_SENT; return 0; } /* * Initiate SDP connection. */ static int sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct sdp_sock *ssk; struct sockaddr_in *sin; sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EINVAL); if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return (EAFNOSUPPORT); if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0) return (error); ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) error = EINVAL; else error = sdp_start_connect(ssk, nam, td); SDP_WUNLOCK(ssk); return (error); } /* * Drop a SDP socket, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ static struct sdp_sock * sdp_drop(struct sdp_sock *ssk, int errno) { struct socket *so; SDP_WLOCK_ASSERT(ssk); so = ssk->socket; if (TCPS_HAVERCVDSYN(ssk->state)) sdp_output_reset(ssk); if (errno == ETIMEDOUT && ssk->softerror) errno = ssk->softerror; so->so_error = errno; return (sdp_closed(ssk)); } /* * User issued close, and wish to trail through shutdown states: * if never received SYN, just forget it. If got a SYN from peer, * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. * If already got a FIN from peer, then almost done; go to LAST_ACK * state. In all other cases, have already sent FIN to peer (e.g. * after PRU_SHUTDOWN), and just have to play tedious game waiting * for peer to send FIN or not respond to keep-alives, etc. * We can let the user exit from the close as soon as the FIN is acked. */ static void sdp_usrclosed(struct sdp_sock *ssk) { SDP_WLOCK_ASSERT(ssk); switch (ssk->state) { case TCPS_LISTEN: ssk->state = TCPS_CLOSED; SDP_WUNLOCK(ssk); sdp_destroy_cma(ssk); SDP_WLOCK(ssk); /* FALLTHROUGH */ case TCPS_CLOSED: ssk = sdp_closed(ssk); /* * sdp_closed() should never return NULL here as the socket is * still open. */ KASSERT(ssk != NULL, ("sdp_usrclosed: sdp_closed() returned NULL")); break; case TCPS_SYN_SENT: /* FALLTHROUGH */ case TCPS_SYN_RECEIVED: ssk->flags |= SDP_NEEDFIN; break; case TCPS_ESTABLISHED: ssk->flags |= SDP_NEEDFIN; ssk->state = TCPS_FIN_WAIT_1; break; case TCPS_CLOSE_WAIT: ssk->state = TCPS_LAST_ACK; break; } if (ssk->state >= TCPS_FIN_WAIT_2) { /* Prevent the connection hanging in FIN_WAIT_2 forever. */ if (ssk->state == TCPS_FIN_WAIT_2) sdp_2msl_wait(ssk); else soisdisconnected(ssk->socket); } } static void sdp_output_disconnect(struct sdp_sock *ssk) { SDP_WLOCK_ASSERT(ssk); callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT, sdp_dreq_timeout, ssk); ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT; sdp_post_sends(ssk, M_NOWAIT); } /* * Initiate or continue a disconnect. * If embryonic state, just send reset (once). * If in ``let data drain'' option and linger null, just drop. * Otherwise (hard), mark socket disconnecting and drop * current input data; switch states based on user close, and * send segment to peer (with FIN). */ static void sdp_start_disconnect(struct sdp_sock *ssk) { struct socket *so; int unread; so = ssk->socket; SDP_WLOCK_ASSERT(ssk); sdp_stop_keepalive_timer(so); /* * Neither sdp_closed() nor sdp_drop() should return NULL, as the * socket is still open. */ if (ssk->state < TCPS_ESTABLISHED) { ssk = sdp_closed(ssk); KASSERT(ssk != NULL, ("sdp_start_disconnect: sdp_close() returned NULL")); } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { ssk = sdp_drop(ssk, 0); KASSERT(ssk != NULL, ("sdp_start_disconnect: sdp_drop() returned NULL")); } else { soisdisconnecting(so); unread = sbused(&so->so_rcv); sbflush(&so->so_rcv); sdp_usrclosed(ssk); if (!(ssk->flags & SDP_DROPPED)) { if (unread) sdp_output_reset(ssk); else sdp_output_disconnect(ssk); } } } /* * User initiated disconnect. */ static int sdp_disconnect(struct socket *so) { struct sdp_sock *ssk; int error = 0; ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = ECONNRESET; goto out; } sdp_start_disconnect(ssk); out: SDP_WUNLOCK(ssk); return (error); } /* * Accept a connection. Essentially all the work is done at higher levels; * just return the address of the peer, storing through addr. * * * XXX This is broken XXX * * The rationale for acquiring the sdp lock here is somewhat complicated, * and is described in detail in the commit log entry for r175612. Acquiring * it delays an accept(2) racing with sonewconn(), which inserts the socket * before the address/port fields are initialized. A better fix would * prevent the socket from being placed in the listen queue until all fields * are fully initialized. */ static int sdp_accept(struct socket *so, struct sockaddr **nam) { struct sdp_sock *ssk = NULL; struct in_addr addr; in_port_t port; int error; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); port = 0; addr.s_addr = 0; error = 0; ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = ECONNABORTED; goto out; } port = ssk->fport; addr.s_addr = ssk->faddr; out: SDP_WUNLOCK(ssk); if (error == 0) *nam = sdp_sockaddr(port, &addr); return error; } /* * Mark the connection as being incapable of further output. */ static int sdp_shutdown(struct socket *so) { int error = 0; struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = ECONNRESET; goto out; } socantsendmore(so); sdp_usrclosed(ssk); if (!(ssk->flags & SDP_DROPPED)) sdp_output_disconnect(ssk); out: SDP_WUNLOCK(ssk); return (error); } static void sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt) { struct mbuf *n; int ncnt; SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); KASSERT(mb->m_flags & M_PKTHDR, ("sdp_append: %p Missing packet header.\n", mb)); n = sb->sb_lastrecord; /* * If the queue is empty just set all pointers and proceed. */ if (n == NULL) { sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb; for (; mb; mb = mb->m_next) { sb->sb_mbtail = mb; sballoc(sb, mb); } return; } /* * Count the number of mbufs in the current tail. */ for (ncnt = 0; n->m_next; n = n->m_next) ncnt++; n = sb->sb_lastrecord; /* * If the two chains can fit in a single sdp packet and * the last record has not been sent yet (WRITABLE) coalesce * them. The lastrecord remains the same but we must strip the * packet header and then let sbcompress do the hard part. */ if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES && n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE < ssk->xmit_size_goal) { m_adj(mb, SDP_HEAD_SIZE); n->m_pkthdr.len += mb->m_pkthdr.len; n->m_flags |= mb->m_flags & (M_PUSH | M_URG); m_demote(mb, 1, 0); sbcompress(sb, mb, sb->sb_mbtail); return; } /* * Not compressible, just append to the end and adjust counters. */ sb->sb_lastrecord->m_flags |= M_PUSH; sb->sb_lastrecord->m_nextpkt = mb; sb->sb_lastrecord = mb; if (sb->sb_sndptr == NULL) sb->sb_sndptr = mb; for (; mb; mb = mb->m_next) { sb->sb_mbtail = mb; sballoc(sb, mb); } } /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other * pru_*() routines, the mbuf chains are our responsibility. We * must either enqueue them or free them. The other pru_* routines * generally are caller-frees. * * This comes from sendfile, normal sends will come from sdp_sosend(). */ static int sdp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct sdp_sock *ssk; struct mbuf *n; int error; int cnt; error = 0; ssk = sdp_sk(so); KASSERT(m->m_flags & M_PKTHDR, ("sdp_send: %p no packet header", m)); M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK); mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA; for (n = m, cnt = 0; n->m_next; n = n->m_next) cnt++; if (cnt > SDP_MAX_SEND_SGES) { n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES); if (n == NULL) { m_freem(m); return (EMSGSIZE); } m = n; for (cnt = 0; n->m_next; n = n->m_next) cnt++; } SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { if (control) m_freem(control); if (m) m_freem(m); error = ECONNRESET; goto out; } if (control) { /* SDP doesn't support control messages. */ if (control->m_len) { m_freem(control); if (m) m_freem(m); error = EINVAL; goto out; } m_freem(control); /* empty control, just free it */ } if (!(flags & PRUS_OOB)) { SOCKBUF_LOCK(&so->so_snd); sdp_append(ssk, &so->so_snd, m, cnt); SOCKBUF_UNLOCK(&so->so_snd); if (nam && ssk->state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected. */ error = sdp_start_connect(ssk, nam, td); if (error) goto out; } if (flags & PRUS_EOF) { /* * Close the send side of the connection after * the data is sent. */ socantsendmore(so); sdp_usrclosed(ssk); if (!(ssk->flags & SDP_DROPPED)) sdp_output_disconnect(ssk); } else if (!(ssk->flags & SDP_DROPPED) && !(flags & PRUS_MORETOCOME)) sdp_post_sends(ssk, M_NOWAIT); SDP_WUNLOCK(ssk); return (0); } else { SOCKBUF_LOCK(&so->so_snd); if (sbspace(&so->so_snd) < -512) { SOCKBUF_UNLOCK(&so->so_snd); m_freem(m); error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ m->m_flags |= M_URG | M_PUSH; sdp_append(ssk, &so->so_snd, m, cnt); SOCKBUF_UNLOCK(&so->so_snd); if (nam && ssk->state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected. */ error = sdp_start_connect(ssk, nam, td); if (error) goto out; } sdp_post_sends(ssk, M_NOWAIT); SDP_WUNLOCK(ssk); return (0); } out: SDP_WUNLOCK(ssk); return (error); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ static int sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { struct sdp_sock *ssk; long space, resid; int atomic; int error; int copy; if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; atomic = top != NULL; if (control != NULL) { if (control->m_len) { m_freem(control); if (top) m_freem(top); return (EINVAL); } m_freem(control); control = NULL; } /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } if (td != NULL) td->td_ru.ru_msgsnd++; ssk = sdp_sk(so); error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid && (atomic || space < so->so_snd.sb_lowat)) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf * chain. If no data is to be copied in, * a single empty mbuf is returned. */ copy = min(space, ssk->xmit_size_goal - SDP_HEAD_SIZE); top = m_uiotombuf(uio, M_WAITOK, copy, 0, M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { /* only possible error */ error = EFAULT; goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date after dropping the * socket lock. */ error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB : /* * Set EOF on the last send if the user specified * MSG_EOF. */ ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, NULL, td); top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: if (top != NULL) m_freem(top); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ static int sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; struct sdp_sock *ssk; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (controlp != NULL) return (EINVAL); if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; ssk = sdp_sk(so); /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) goto out; SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { /* When disconnecting there may be still some data left. */ if (sbavail(sb)) goto deliver; if (!(so->so_state & SS_ISDISCONNECTED)) error = ENOTCONN; goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb)) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb)) goto deliver; else goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { for (*mp0 = m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } sb->sb_mb = m; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); n->m_next = NULL; } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= m->m_len; if (*mp0 != NULL) n->m_next = m; else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ SOCKBUF_UNLOCK(sb); SDP_WLOCK(ssk); sdp_do_posts(ssk); SDP_WUNLOCK(ssk); SOCKBUF_LOCK(sb); } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); sbunlock(sb); return (error); } /* * Abort is used to teardown a connection typically while sitting in * the accept queue. */ void sdp_abort(struct socket *so) { struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); /* * If we have not yet dropped, do it now. */ if (!(ssk->flags & SDP_TIMEWAIT) && !(ssk->flags & SDP_DROPPED)) sdp_drop(ssk, ECONNABORTED); KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X", ssk, ssk->flags)); SDP_WUNLOCK(ssk); } /* * Close a SDP socket and initiate a friendly disconnect. */ static void sdp_close(struct socket *so) { struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); /* * If we have not yet dropped, do it now. */ if (!(ssk->flags & SDP_TIMEWAIT) && !(ssk->flags & SDP_DROPPED)) sdp_start_disconnect(ssk); /* * If we've still not dropped let the socket layer know we're * holding on to the socket and pcb for a while. */ if (!(ssk->flags & SDP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); ssk->flags |= SDP_SOCKREF; } SDP_WUNLOCK(ssk); } /* * User requests out-of-band data. */ static int sdp_rcvoob(struct socket *so, struct mbuf *m, int flags) { int error = 0; struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); if (!rx_ring_trylock(&ssk->rx_ring)) { SDP_WUNLOCK(ssk); return (ECONNRESET); } if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = ECONNRESET; goto out; } if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || ssk->oobflags & SDP_HADOOB) { error = EINVAL; goto out; } if ((ssk->oobflags & SDP_HAVEOOB) == 0) { error = EWOULDBLOCK; goto out; } m->m_len = 1; *mtod(m, caddr_t) = ssk->iobc; if ((flags & MSG_PEEK) == 0) ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB); out: rx_ring_unlock(&ssk->rx_ring); SDP_WUNLOCK(ssk); return (error); } void sdp_urg(struct sdp_sock *ssk, struct mbuf *mb) { struct mbuf *m; struct socket *so; so = ssk->socket; if (so == NULL) return; so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1; sohasoutofband(so); ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB); if (!(so->so_options & SO_OOBINLINE)) { for (m = mb; m->m_next != NULL; m = m->m_next); ssk->iobc = *(mtod(m, char *) + m->m_len - 1); ssk->oobflags |= SDP_HAVEOOB; m->m_len--; mb->m_pkthdr.len--; } } /* * Notify a sdp socket of an asynchronous error. * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ struct sdp_sock * sdp_notify(struct sdp_sock *ssk, int error) { SDP_WLOCK_ASSERT(ssk); if ((ssk->flags & SDP_TIMEWAIT) || (ssk->flags & SDP_DROPPED)) return (ssk); /* * Ignore some errors if we are hooked up. */ if (ssk->state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) return (ssk); ssk->softerror = error; return sdp_drop(ssk, error); } static void sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_addr faddr; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify); } static int sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return (EOPNOTSUPP); } static void sdp_keepalive_timeout(void *data) { struct sdp_sock *ssk; ssk = data; /* Callout canceled. */ if (!callout_active(&ssk->keep2msl)) return; /* Callout rescheduled as a different kind of timer. */ if (callout_pending(&ssk->keep2msl)) goto out; callout_deactivate(&ssk->keep2msl); if (ssk->flags & SDP_DROPPED || (ssk->socket->so_options & SO_KEEPALIVE) == 0) goto out; sdp_post_keepalive(ssk); callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, sdp_keepalive_timeout, ssk); out: SDP_WUNLOCK(ssk); } void sdp_start_keepalive_timer(struct socket *so) { struct sdp_sock *ssk; ssk = sdp_sk(so); if (!callout_pending(&ssk->keep2msl)) callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, sdp_keepalive_timeout, ssk); } static void sdp_stop_keepalive_timer(struct socket *so) { struct sdp_sock *ssk; ssk = sdp_sk(so); callout_stop(&ssk->keep2msl); } /* * sdp_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. */ #define SDP_WLOCK_RECHECK(inp) do { \ SDP_WLOCK(ssk); \ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \ SDP_WUNLOCK(ssk); \ return (ECONNRESET); \ } \ } while(0) static int sdp_ctloutput(struct socket *so, struct sockopt *sopt) { int error, opt, optval; struct sdp_sock *ssk; error = 0; ssk = sdp_sk(so); if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) { SDP_WLOCK(ssk); if (so->so_options & SO_KEEPALIVE) sdp_start_keepalive_timer(so); else sdp_stop_keepalive_timer(so); SDP_WUNLOCK(ssk); } if (sopt->sopt_level != IPPROTO_TCP) return (error); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { SDP_WUNLOCK(ssk); return (ECONNRESET); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case TCP_NODELAY: SDP_WUNLOCK(ssk); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); SDP_WLOCK_RECHECK(ssk); opt = SDP_NODELAY; if (optval) ssk->flags |= opt; else ssk->flags &= ~opt; sdp_do_posts(ssk); SDP_WUNLOCK(ssk); break; default: SDP_WUNLOCK(ssk); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case TCP_NODELAY: optval = ssk->flags & SDP_NODELAY; SDP_WUNLOCK(ssk); error = sooptcopyout(sopt, &optval, sizeof optval); break; default: SDP_WUNLOCK(ssk); error = ENOPROTOOPT; break; } break; } return (error); } #undef SDP_WLOCK_RECHECK int sdp_mod_count = 0; int sdp_mod_usec = 0; void sdp_set_default_moderation(struct sdp_sock *ssk) { struct ib_cq_attr attr; if (sdp_mod_count <= 0 || sdp_mod_usec <= 0) return; memset(&attr, 0, sizeof(attr)); attr.moderation.cq_count = sdp_mod_count; attr.moderation.cq_period = sdp_mod_usec; ib_modify_cq(ssk->rx_ring.cq, &attr, IB_CQ_MODERATION); } static void sdp_dev_add(struct ib_device *device) { struct ib_fmr_pool_param param; struct sdp_device *sdp_dev; sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO); sdp_dev->pd = ib_alloc_pd(device); if (IS_ERR(sdp_dev->pd)) goto out_pd; sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(sdp_dev->mr)) goto out_mr; memset(¶m, 0, sizeof param); param.max_pages_per_fmr = SDP_FMR_SIZE; param.page_shift = PAGE_SHIFT; param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ); param.pool_size = SDP_FMR_POOL_SIZE; param.dirty_watermark = SDP_FMR_DIRTY_SIZE; param.cache = 1; sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, ¶m); if (IS_ERR(sdp_dev->fmr_pool)) goto out_fmr; ib_set_client_data(device, &sdp_client, sdp_dev); return; out_fmr: ib_dereg_mr(sdp_dev->mr); out_mr: ib_dealloc_pd(sdp_dev->pd); out_pd: free(sdp_dev, M_SDP); } static void sdp_dev_rem(struct ib_device *device) { struct sdp_device *sdp_dev; struct sdp_sock *ssk; SDP_LIST_WLOCK(); LIST_FOREACH(ssk, &sdp_list, list) { if (ssk->ib_device != device) continue; SDP_WLOCK(ssk); if ((ssk->flags & SDP_DESTROY) == 0) ssk = sdp_notify(ssk, ECONNRESET); if (ssk) SDP_WUNLOCK(ssk); } SDP_LIST_WUNLOCK(); /* * XXX Do I need to wait between these two? */ sdp_dev = ib_get_client_data(device, &sdp_client); if (!sdp_dev) return; ib_flush_fmr_pool(sdp_dev->fmr_pool); ib_destroy_fmr_pool(sdp_dev->fmr_pool); ib_dereg_mr(sdp_dev->mr); ib_dealloc_pd(sdp_dev->pd); free(sdp_dev, M_SDP); } struct ib_client sdp_client = { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem }; static int sdp_pcblist(SYSCTL_HANDLER_ARGS) { int error, n, i; struct sdp_sock *ssk; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = sdp_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ SDP_LIST_RLOCK(); n = sdp_count; SDP_LIST_RUNLOCK(); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = 0; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); SDP_LIST_RLOCK(); for (ssk = LIST_FIRST(&sdp_list), i = 0; ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) { struct xtcpcb xt; SDP_RLOCK(ssk); if (ssk->flags & SDP_TIMEWAIT) { if (ssk->cred != NULL) error = cr_cansee(req->td->td_ucred, ssk->cred); else error = EINVAL; /* Skip this inp. */ } else if (ssk->socket) error = cr_canseesocket(req->td->td_ucred, ssk->socket); else error = EINVAL; if (error) { error = 0; goto next; } bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; xt.xt_inp.inp_gencnt = 0; xt.xt_inp.inp_vflag = INP_IPV4; memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr)); xt.xt_inp.inp_lport = ssk->lport; memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr)); xt.xt_inp.inp_fport = ssk->fport; xt.t_state = ssk->state; if (ssk->socket != NULL) sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket); xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; SDP_RUNLOCK(ssk); error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) break; i++; continue; next: SDP_RUNLOCK(ssk); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ xig.xig_gen = 0; xig.xig_sogen = so_gencnt; xig.xig_count = sdp_count; error = SYSCTL_OUT(req, &xig, sizeof xig); } SDP_LIST_RUNLOCK(); return (error); } static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP"); SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb", "List of active SDP connections"); static void sdp_zone_change(void *tag) { uma_zone_set_max(sdp_zone, maxsockets); } static void sdp_init(void) { LIST_INIT(&sdp_list); sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(sdp_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL, EVENTHANDLER_PRI_ANY); rx_comp_wq = create_singlethread_workqueue("rx_comp_wq"); ib_register_client(&sdp_client); } extern struct domain sdpdomain; struct pr_usrreqs sdp_usrreqs = { .pru_abort = sdp_abort, .pru_accept = sdp_accept, .pru_attach = sdp_attach, .pru_bind = sdp_bind, .pru_connect = sdp_connect, .pru_control = sdp_control, .pru_detach = sdp_detach, .pru_disconnect = sdp_disconnect, .pru_listen = sdp_listen, .pru_peeraddr = sdp_getpeeraddr, .pru_rcvoob = sdp_rcvoob, .pru_send = sdp_send, .pru_sosend = sdp_sosend, .pru_soreceive = sdp_sorecv, .pru_shutdown = sdp_shutdown, .pru_sockaddr = sdp_getsockaddr, .pru_close = sdp_close, }; struct protosw sdpsw[] = { { .pr_type = SOCK_STREAM, .pr_domain = &sdpdomain, .pr_protocol = IPPROTO_IP, .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, .pr_ctlinput = sdp_ctlinput, .pr_ctloutput = sdp_ctloutput, .pr_usrreqs = &sdp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &sdpdomain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, .pr_ctlinput = sdp_ctlinput, .pr_ctloutput = sdp_ctloutput, .pr_usrreqs = &sdp_usrreqs }, }; struct domain sdpdomain = { .dom_family = AF_INET_SDP, .dom_name = "SDP", .dom_init = sdp_init, .dom_protosw = sdpsw, .dom_protoswNPROTOSW = &sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])], }; DOMAIN_SET(sdp); int sdp_debug_level = 1; int sdp_data_debug_level = 0; Index: head/sys/rpc/svc_vc.c =================================================================== --- head/sys/rpc/svc_vc.c (revision 319721) +++ head/sys/rpc/svc_vc.c (revision 319722) @@ -1,983 +1,992 @@ /* $NetBSD: svc_vc.c,v 1.7 2000/08/03 00:01:53 fvdl Exp $ */ /*- * Copyright (c) 2009, Sun Microsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of Sun Microsystems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #if defined(LIBC_SCCS) && !defined(lint) static char *sccsid2 = "@(#)svc_tcp.c 1.21 87/08/11 Copyr 1984 Sun Micro"; static char *sccsid = "@(#)svc_tcp.c 2.2 88/08/01 4.0 RPCSRC"; #endif #include __FBSDID("$FreeBSD$"); /* * svc_vc.c, Server side for Connection Oriented based RPC. * * Actually implements two flavors of transporter - * a tcp rendezvouser (a listner and connection establisher) * and a record/tcp stream. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static bool_t svc_vc_rendezvous_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *); static void svc_vc_rendezvous_destroy(SVCXPRT *); static bool_t svc_vc_null(void); static void svc_vc_destroy(SVCXPRT *); static enum xprt_stat svc_vc_stat(SVCXPRT *); static bool_t svc_vc_ack(SVCXPRT *, uint32_t *); static bool_t svc_vc_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static bool_t svc_vc_reply(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *seq); static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in); static bool_t svc_vc_rendezvous_control (SVCXPRT *xprt, const u_int rq, void *in); static void svc_vc_backchannel_destroy(SVCXPRT *); static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *); static bool_t svc_vc_backchannel_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static bool_t svc_vc_backchannel_reply(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *); static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in); static SVCXPRT *svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr); static int svc_vc_accept(struct socket *head, struct socket **sop); static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag); +static int svc_vc_rendezvous_soupcall(struct socket *, void *, int); static struct xp_ops svc_vc_rendezvous_ops = { .xp_recv = svc_vc_rendezvous_recv, .xp_stat = svc_vc_rendezvous_stat, .xp_reply = (bool_t (*)(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *))svc_vc_null, .xp_destroy = svc_vc_rendezvous_destroy, .xp_control = svc_vc_rendezvous_control }; static struct xp_ops svc_vc_ops = { .xp_recv = svc_vc_recv, .xp_stat = svc_vc_stat, .xp_ack = svc_vc_ack, .xp_reply = svc_vc_reply, .xp_destroy = svc_vc_destroy, .xp_control = svc_vc_control }; static struct xp_ops svc_vc_backchannel_ops = { .xp_recv = svc_vc_backchannel_recv, .xp_stat = svc_vc_backchannel_stat, .xp_reply = svc_vc_backchannel_reply, .xp_destroy = svc_vc_backchannel_destroy, .xp_control = svc_vc_backchannel_control }; /* * Usage: * xprt = svc_vc_create(sock, send_buf_size, recv_buf_size); * * Creates, registers, and returns a (rpc) tcp based transporter. * Once *xprt is initialized, it is registered as a transporter * see (svc.h, xprt_register). This routine returns * a NULL if a problem occurred. * * The filedescriptor passed in is expected to refer to a bound, but * not yet connected socket. * * Since streams do buffered io similar to stdio, the caller can specify * how big the send and receive buffers are via the second and third parms; * 0 => use the system default. */ SVCXPRT * svc_vc_create(SVCPOOL *pool, struct socket *so, size_t sendsize, size_t recvsize) { SVCXPRT *xprt; struct sockaddr* sa; int error; SOCK_LOCK(so); if (so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED)) { SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); CURVNET_RESTORE(); if (error) return (NULL); xprt = svc_vc_create_conn(pool, so, sa); free(sa, M_SONAME); return (xprt); } SOCK_UNLOCK(so); xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = so; xprt->xp_p1 = NULL; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_rendezvous_ops; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) { goto cleanup_svc_vc_create; } memcpy(&xprt->xp_ltaddr, sa, sa->sa_len); free(sa, M_SONAME); xprt_register(xprt); solisten(so, -1, curthread); - SOCKBUF_LOCK(&so->so_rcv); + SOLISTEN_LOCK(so); xprt->xp_upcallset = 1; - soupcall_set(so, SO_RCV, svc_vc_soupcall, xprt); - SOCKBUF_UNLOCK(&so->so_rcv); + solisten_upcall_set(so, svc_vc_rendezvous_soupcall, xprt); + SOLISTEN_UNLOCK(so); return (xprt); cleanup_svc_vc_create: sx_destroy(&xprt->xp_lock); svc_xprt_free(xprt); return (NULL); } /* * Create a new transport for a socket optained via soaccept(). */ SVCXPRT * svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr) { SVCXPRT *xprt; struct cf_conn *cd; struct sockaddr* sa = NULL; struct sockopt opt; int one = 1; int error; bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = SOL_SOCKET; opt.sopt_name = SO_KEEPALIVE; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error) { return (NULL); } if (so->so_proto->pr_protocol == IPPROTO_TCP) { bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error) { return (NULL); } } cd = mem_alloc(sizeof(*cd)); cd->strm_stat = XPRT_IDLE; xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = so; xprt->xp_p1 = cd; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_ops; /* * See http://www.connectathon.org/talks96/nfstcp.pdf - client * has a 5 minute timer, server has a 6 minute timer. */ xprt->xp_idletimeout = 6 * 60; memcpy(&xprt->xp_rtaddr, raddr, raddr->sa_len); CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) goto cleanup_svc_vc_create; memcpy(&xprt->xp_ltaddr, sa, sa->sa_len); free(sa, M_SONAME); xprt_register(xprt); SOCKBUF_LOCK(&so->so_rcv); xprt->xp_upcallset = 1; soupcall_set(so, SO_RCV, svc_vc_soupcall, xprt); SOCKBUF_UNLOCK(&so->so_rcv); /* * Throw the transport into the active list in case it already * has some data buffered. */ sx_xlock(&xprt->xp_lock); xprt_active(xprt); sx_xunlock(&xprt->xp_lock); return (xprt); cleanup_svc_vc_create: sx_destroy(&xprt->xp_lock); svc_xprt_free(xprt); mem_free(cd, sizeof(*cd)); return (NULL); } /* * Create a new transport for a backchannel on a clnt_vc socket. */ SVCXPRT * svc_vc_create_backchannel(SVCPOOL *pool) { SVCXPRT *xprt = NULL; struct cf_conn *cd = NULL; cd = mem_alloc(sizeof(*cd)); cd->strm_stat = XPRT_IDLE; xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = NULL; xprt->xp_p1 = cd; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_backchannel_ops; return (xprt); } /* * This does all of the accept except the final call to soaccept. The * caller will call soaccept after dropping its locks (soaccept may * call malloc). */ int svc_vc_accept(struct socket *head, struct socket **sop) { - int error = 0; struct socket *so; + int error = 0; + short nbio; + /* XXXGL: shouldn't that be an assertion? */ if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(curthread->td_ucred, head); if (error != 0) goto done; #endif - ACCEPT_LOCK(); - if (TAILQ_EMPTY(&head->so_comp)) { - ACCEPT_UNLOCK(); - error = EWOULDBLOCK; - goto done; - } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("svc_vc_accept: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("svc_vc_accept: so not SQ_COMP")); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - * XXX might not need soref() since this is simpler than kern_accept. + * XXXGL: we want non-blocking semantics. The socket could be a + * socket created by kernel as well as socket shared with userland, + * so we can't be sure about presense of SS_NBIO. We also shall not + * toggle it on the socket, since that may surprise userland. So we + * set SS_NBIO only temporarily. */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ + SOLISTEN_LOCK(head); + nbio = head->so_state & SS_NBIO; + head->so_state |= SS_NBIO; + error = solisten_dequeue(head, &so, 0); + head->so_state &= (nbio & ~SS_NBIO); + if (error) + goto done; - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_state |= (head->so_state & SS_NBIO); - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); - + so->so_state |= nbio; *sop = so; /* connection has been removed from the listen queue */ - KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); + KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); done: return (error); } /*ARGSUSED*/ static bool_t svc_vc_rendezvous_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct socket *so = NULL; struct sockaddr *sa = NULL; int error; SVCXPRT *new_xprt; /* * The socket upcall calls xprt_active() which will eventually * cause the server to call us here. We attempt to accept a * connection from the socket and turn it into a new * transport. If the accept fails, we have drained all pending * connections so we call xprt_inactive(). */ sx_xlock(&xprt->xp_lock); error = svc_vc_accept(xprt->xp_socket, &so); if (error == EWOULDBLOCK) { /* * We must re-test for new connections after taking * the lock to protect us in the case where a new * connection arrives after our call to accept fails * with EWOULDBLOCK. */ - ACCEPT_LOCK(); - if (TAILQ_EMPTY(&xprt->xp_socket->so_comp)) + SOLISTEN_LOCK(xprt->xp_socket); + if (TAILQ_EMPTY(&xprt->xp_socket->sol_comp)) xprt_inactive_self(xprt); - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(xprt->xp_socket); sx_xunlock(&xprt->xp_lock); return (FALSE); } if (error) { - SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); + SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(xprt->xp_socket, SO_RCV); } - SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); + SOLISTEN_UNLOCK(xprt->xp_socket); xprt_inactive_self(xprt); sx_xunlock(&xprt->xp_lock); return (FALSE); } sx_xunlock(&xprt->xp_lock); sa = NULL; error = soaccept(so, &sa); if (error) { /* * XXX not sure if I need to call sofree or soclose here. */ if (sa) free(sa, M_SONAME); return (FALSE); } /* * svc_vc_create_conn will call xprt_register - we don't need * to do anything with the new connection except derefence it. */ new_xprt = svc_vc_create_conn(xprt->xp_pool, so, sa); if (!new_xprt) { soclose(so); } else { SVC_RELEASE(new_xprt); } free(sa, M_SONAME); return (FALSE); /* there is never an rpc msg to be processed */ } /*ARGSUSED*/ static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *xprt) { return (XPRT_IDLE); } static void svc_vc_destroy_common(SVCXPRT *xprt) { - SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); - if (xprt->xp_upcallset) { - xprt->xp_upcallset = 0; - soupcall_clear(xprt->xp_socket, SO_RCV); - } - SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); if (xprt->xp_socket) (void)soclose(xprt->xp_socket); if (xprt->xp_netid) (void) mem_free(xprt->xp_netid, strlen(xprt->xp_netid) + 1); svc_xprt_free(xprt); } static void svc_vc_rendezvous_destroy(SVCXPRT *xprt) { + SOLISTEN_LOCK(xprt->xp_socket); + if (xprt->xp_upcallset) { + xprt->xp_upcallset = 0; + solisten_upcall_set(xprt->xp_socket, NULL, NULL); + } + SOLISTEN_UNLOCK(xprt->xp_socket); + svc_vc_destroy_common(xprt); } static void svc_vc_destroy(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; + SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); + if (xprt->xp_upcallset) { + xprt->xp_upcallset = 0; + soupcall_clear(xprt->xp_socket, SO_RCV); + } + SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); + svc_vc_destroy_common(xprt); if (cd->mreq) m_freem(cd->mreq); if (cd->mpending) m_freem(cd->mpending); mem_free(cd, sizeof(*cd)); } static void svc_vc_backchannel_destroy(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; struct mbuf *m, *m2; svc_xprt_free(xprt); m = cd->mreq; while (m != NULL) { m2 = m; m = m->m_nextpkt; m_freem(m2); } mem_free(cd, sizeof(*cd)); } /*ARGSUSED*/ static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static bool_t svc_vc_rendezvous_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static enum xprt_stat svc_vc_stat(SVCXPRT *xprt) { struct cf_conn *cd; cd = (struct cf_conn *)(xprt->xp_p1); if (cd->strm_stat == XPRT_DIED) return (XPRT_DIED); if (cd->mreq != NULL && cd->resid == 0 && cd->eor) return (XPRT_MOREREQS); if (soreadable(xprt->xp_socket)) return (XPRT_MOREREQS); return (XPRT_IDLE); } static bool_t svc_vc_ack(SVCXPRT *xprt, uint32_t *ack) { *ack = atomic_load_acq_32(&xprt->xp_snt_cnt); *ack -= sbused(&xprt->xp_socket->so_snd); return (TRUE); } static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *xprt) { struct cf_conn *cd; cd = (struct cf_conn *)(xprt->xp_p1); if (cd->mreq != NULL) return (XPRT_MOREREQS); return (XPRT_IDLE); } /* * If we have an mbuf chain in cd->mpending, try to parse a record from it, * leaving the result in cd->mreq. If we don't have a complete record, leave * the partial result in cd->mreq and try to read more from the socket. */ static int svc_vc_process_pending(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct socket *so = xprt->xp_socket; struct mbuf *m; /* * If cd->resid is non-zero, we have part of the * record already, otherwise we are expecting a record * marker. */ if (!cd->resid && cd->mpending) { /* * See if there is enough data buffered to * make up a record marker. Make sure we can * handle the case where the record marker is * split across more than one mbuf. */ size_t n = 0; uint32_t header; m = cd->mpending; while (n < sizeof(uint32_t) && m) { n += m->m_len; m = m->m_next; } if (n < sizeof(uint32_t)) { so->so_rcv.sb_lowat = sizeof(uint32_t) - n; return (FALSE); } m_copydata(cd->mpending, 0, sizeof(header), (char *)&header); header = ntohl(header); cd->eor = (header & 0x80000000) != 0; cd->resid = header & 0x7fffffff; m_adj(cd->mpending, sizeof(uint32_t)); } /* * Start pulling off mbufs from cd->mpending * until we either have a complete record or * we run out of data. We use m_split to pull * data - it will pull as much as possible and * split the last mbuf if necessary. */ while (cd->mpending && cd->resid) { m = cd->mpending; if (cd->mpending->m_next || cd->mpending->m_len > cd->resid) cd->mpending = m_split(cd->mpending, cd->resid, M_WAITOK); else cd->mpending = NULL; if (cd->mreq) m_last(cd->mreq)->m_next = m; else cd->mreq = m; while (m) { cd->resid -= m->m_len; m = m->m_next; } } /* * Block receive upcalls if we have more data pending, * otherwise report our need. */ if (cd->mpending) so->so_rcv.sb_lowat = INT_MAX; else so->so_rcv.sb_lowat = imax(1, imin(cd->resid, so->so_rcv.sb_hiwat / 2)); return (TRUE); } static bool_t svc_vc_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct uio uio; struct mbuf *m; struct socket* so = xprt->xp_socket; XDR xdrs; int error, rcvflag; uint32_t xid_plus_direction[2]; /* * Serialise access to the socket and our own record parsing * state. */ sx_xlock(&xprt->xp_lock); for (;;) { /* If we have no request ready, check pending queue. */ while (cd->mpending && (cd->mreq == NULL || cd->resid != 0 || !cd->eor)) { if (!svc_vc_process_pending(xprt)) break; } /* Process and return complete request in cd->mreq. */ if (cd->mreq != NULL && cd->resid == 0 && cd->eor) { /* * Now, check for a backchannel reply. * The XID is in the first uint32_t of the reply * and the message direction is the second one. */ if ((cd->mreq->m_len >= sizeof(xid_plus_direction) || m_length(cd->mreq, NULL) >= sizeof(xid_plus_direction)) && xprt->xp_p2 != NULL) { m_copydata(cd->mreq, 0, sizeof(xid_plus_direction), (char *)xid_plus_direction); xid_plus_direction[0] = ntohl(xid_plus_direction[0]); xid_plus_direction[1] = ntohl(xid_plus_direction[1]); /* Check message direction. */ if (xid_plus_direction[1] == REPLY) { clnt_bck_svccall(xprt->xp_p2, cd->mreq, xid_plus_direction[0]); cd->mreq = NULL; continue; } } xdrmbuf_create(&xdrs, cd->mreq, XDR_DECODE); cd->mreq = NULL; /* Check for next request in a pending queue. */ svc_vc_process_pending(xprt); if (cd->mreq == NULL || cd->resid != 0) { SOCKBUF_LOCK(&so->so_rcv); if (!soreadable(so)) xprt_inactive_self(xprt); SOCKBUF_UNLOCK(&so->so_rcv); } sx_xunlock(&xprt->xp_lock); if (! xdr_callmsg(&xdrs, msg)) { XDR_DESTROY(&xdrs); return (FALSE); } *addrp = NULL; *mp = xdrmbuf_getall(&xdrs); XDR_DESTROY(&xdrs); return (TRUE); } /* * The socket upcall calls xprt_active() which will eventually * cause the server to call us here. We attempt to * read as much as possible from the socket and put * the result in cd->mpending. If the read fails, * we have drained both cd->mpending and the socket so * we can call xprt_inactive(). */ uio.uio_resid = 1000000000; uio.uio_td = curthread; m = NULL; rcvflag = MSG_DONTWAIT; error = soreceive(so, NULL, &uio, &m, NULL, &rcvflag); if (error == EWOULDBLOCK) { /* * We must re-test for readability after * taking the lock to protect us in the case * where a new packet arrives on the socket * after our call to soreceive fails with * EWOULDBLOCK. */ SOCKBUF_LOCK(&so->so_rcv); if (!soreadable(so)) xprt_inactive_self(xprt); SOCKBUF_UNLOCK(&so->so_rcv); sx_xunlock(&xprt->xp_lock); return (FALSE); } if (error) { SOCKBUF_LOCK(&so->so_rcv); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(so, SO_RCV); } SOCKBUF_UNLOCK(&so->so_rcv); xprt_inactive_self(xprt); cd->strm_stat = XPRT_DIED; sx_xunlock(&xprt->xp_lock); return (FALSE); } if (!m) { /* * EOF - the other end has closed the socket. */ xprt_inactive_self(xprt); cd->strm_stat = XPRT_DIED; sx_xunlock(&xprt->xp_lock); return (FALSE); } if (cd->mpending) m_last(cd->mpending)->m_next = m; else cd->mpending = m; } } static bool_t svc_vc_backchannel_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct ct_data *ct; struct mbuf *m; XDR xdrs; sx_xlock(&xprt->xp_lock); ct = (struct ct_data *)xprt->xp_p2; if (ct == NULL) { sx_xunlock(&xprt->xp_lock); return (FALSE); } mtx_lock(&ct->ct_lock); m = cd->mreq; if (m == NULL) { xprt_inactive_self(xprt); mtx_unlock(&ct->ct_lock); sx_xunlock(&xprt->xp_lock); return (FALSE); } cd->mreq = m->m_nextpkt; mtx_unlock(&ct->ct_lock); sx_xunlock(&xprt->xp_lock); xdrmbuf_create(&xdrs, m, XDR_DECODE); if (! xdr_callmsg(&xdrs, msg)) { XDR_DESTROY(&xdrs); return (FALSE); } *addrp = NULL; *mp = xdrmbuf_getall(&xdrs); XDR_DESTROY(&xdrs); return (TRUE); } static bool_t svc_vc_reply(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr *addr, struct mbuf *m, uint32_t *seq) { XDR xdrs; struct mbuf *mrep; bool_t stat = TRUE; int error, len; /* * Leave space for record mark. */ mrep = m_gethdr(M_WAITOK, MT_DATA); mrep->m_data += sizeof(uint32_t); xdrmbuf_create(&xdrs, mrep, XDR_ENCODE); if (msg->rm_reply.rp_stat == MSG_ACCEPTED && msg->rm_reply.rp_acpt.ar_stat == SUCCESS) { if (!xdr_replymsg(&xdrs, msg)) stat = FALSE; else xdrmbuf_append(&xdrs, m); } else { stat = xdr_replymsg(&xdrs, msg); } if (stat) { m_fixhdr(mrep); /* * Prepend a record marker containing the reply length. */ M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK); len = mrep->m_pkthdr.len; *mtod(mrep, uint32_t *) = htonl(0x80000000 | (len - sizeof(uint32_t))); atomic_add_32(&xprt->xp_snd_cnt, len); error = sosend(xprt->xp_socket, NULL, NULL, mrep, NULL, 0, curthread); if (!error) { atomic_add_rel_32(&xprt->xp_snt_cnt, len); if (seq) *seq = xprt->xp_snd_cnt; stat = TRUE; } else atomic_subtract_32(&xprt->xp_snd_cnt, len); } else { m_freem(mrep); } XDR_DESTROY(&xdrs); return (stat); } static bool_t svc_vc_backchannel_reply(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr *addr, struct mbuf *m, uint32_t *seq) { struct ct_data *ct; XDR xdrs; struct mbuf *mrep; bool_t stat = TRUE; int error; /* * Leave space for record mark. */ mrep = m_gethdr(M_WAITOK, MT_DATA); mrep->m_data += sizeof(uint32_t); xdrmbuf_create(&xdrs, mrep, XDR_ENCODE); if (msg->rm_reply.rp_stat == MSG_ACCEPTED && msg->rm_reply.rp_acpt.ar_stat == SUCCESS) { if (!xdr_replymsg(&xdrs, msg)) stat = FALSE; else xdrmbuf_append(&xdrs, m); } else { stat = xdr_replymsg(&xdrs, msg); } if (stat) { m_fixhdr(mrep); /* * Prepend a record marker containing the reply length. */ M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK); *mtod(mrep, uint32_t *) = htonl(0x80000000 | (mrep->m_pkthdr.len - sizeof(uint32_t))); sx_xlock(&xprt->xp_lock); ct = (struct ct_data *)xprt->xp_p2; if (ct != NULL) error = sosend(ct->ct_socket, NULL, NULL, mrep, NULL, 0, curthread); else error = EPIPE; sx_xunlock(&xprt->xp_lock); if (!error) { stat = TRUE; } } else { m_freem(mrep); } XDR_DESTROY(&xdrs); return (stat); } static bool_t svc_vc_null() { return (FALSE); } static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag) { SVCXPRT *xprt = (SVCXPRT *) arg; if (soreadable(xprt->xp_socket)) + xprt_active(xprt); + return (SU_OK); +} + +static int +svc_vc_rendezvous_soupcall(struct socket *head, void *arg, int waitflag) +{ + SVCXPRT *xprt = (SVCXPRT *) arg; + + if (!TAILQ_EMPTY(&head->sol_comp)) xprt_active(xprt); return (SU_OK); } #if 0 /* * Get the effective UID of the sending process. Used by rpcbind, keyserv * and rpc.yppasswdd on AF_LOCAL. */ int __rpc_get_local_uid(SVCXPRT *transp, uid_t *uid) { int sock, ret; gid_t egid; uid_t euid; struct sockaddr *sa; sock = transp->xp_fd; sa = (struct sockaddr *)transp->xp_rtaddr; if (sa->sa_family == AF_LOCAL) { ret = getpeereid(sock, &euid, &egid); if (ret == 0) *uid = euid; return (ret); } else return (-1); } #endif Index: head/sys/sys/sockbuf.h =================================================================== --- head/sys/sys/sockbuf.h (revision 319721) +++ head/sys/sys/sockbuf.h (revision 319722) @@ -1,259 +1,259 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 * * $FreeBSD$ */ #ifndef _SYS_SOCKBUF_H_ #define _SYS_SOCKBUF_H_ -#include /* for struct selinfo */ #include #include #include #include #define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ /* * Constants for sb_flags field of struct sockbuf. */ #define SB_WAIT 0x04 /* someone is waiting for data/space */ #define SB_SEL 0x08 /* someone is selecting */ #define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ #define SB_UPCALL 0x20 /* someone wants an upcall */ #define SB_NOINTR 0x40 /* operations not interruptible */ #define SB_AIO 0x80 /* AIO operations queued */ #define SB_KNOTE 0x100 /* kernel note attached */ #define SB_NOCOALESCE 0x200 /* don't coalesce new data into existing mbufs */ #define SB_IN_TOE 0x400 /* socket buffer is in the middle of an operation */ #define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ #define SB_STOP 0x1000 /* backpressure indicator */ #define SB_AIO_RUNNING 0x2000 /* AIO operation running */ #define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ #define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ #define SBS_RCVATMARK 0x0040 /* at mark on input */ struct mbuf; struct sockaddr; struct socket; struct thread; +struct selinfo; struct xsockbuf { u_int sb_cc; u_int sb_hiwat; u_int sb_mbcnt; u_int sb_mcnt; u_int sb_ccnt; u_int sb_mbmax; int sb_lowat; int sb_timeo; short sb_flags; }; /* * Variables for socket buffering. * * Locking key to struct sockbuf: * (a) locked by SOCKBUF_LOCK(). */ struct sockbuf { - struct selinfo sb_sel; /* process selecting read/write */ - struct mtx sb_mtx; /* sockbuf lock */ - struct sx sb_sx; /* prevent I/O interlacing */ + struct mtx sb_mtx; /* sockbuf lock */ + struct sx sb_sx; /* prevent I/O interlacing */ + struct selinfo *sb_sel; /* process selecting read/write */ short sb_state; /* (a) socket state on sockbuf */ #define sb_startzero sb_mb struct mbuf *sb_mb; /* (a) the mbuf chain */ struct mbuf *sb_mbtail; /* (a) the last mbuf in the chain */ struct mbuf *sb_lastrecord; /* (a) first mbuf of last * record in socket buffer */ struct mbuf *sb_sndptr; /* (a) pointer into mbuf chain */ struct mbuf *sb_fnrdy; /* (a) pointer to first not ready buffer */ u_int sb_sndptroff; /* (a) byte offset of ptr into chain */ u_int sb_acc; /* (a) available chars in buffer */ u_int sb_ccc; /* (a) claimed chars in buffer */ u_int sb_hiwat; /* (a) max actual char count */ u_int sb_mbcnt; /* (a) chars of mbufs used */ u_int sb_mcnt; /* (a) number of mbufs in buffer */ u_int sb_ccnt; /* (a) number of clusters in buffer */ u_int sb_mbmax; /* (a) max chars of mbufs to use */ u_int sb_ctl; /* (a) non-data chars in buffer */ int sb_lowat; /* (a) low water mark */ sbintime_t sb_timeo; /* (a) timeout for read/write */ short sb_flags; /* (a) flags, see below */ int (*sb_upcall)(struct socket *, void *, int); /* (a) */ void *sb_upcallarg; /* (a) */ TAILQ_HEAD(, kaiocb) sb_aiojobq; /* (a) pending AIO ops */ struct task sb_aiotask; /* AIO task */ }; #ifdef _KERNEL /* * Per-socket buffer mutex used to protect most fields in the socket * buffer. */ #define SOCKBUF_MTX(_sb) (&(_sb)->sb_mtx) #define SOCKBUF_LOCK_INIT(_sb, _name) \ mtx_init(SOCKBUF_MTX(_sb), _name, NULL, MTX_DEF) #define SOCKBUF_LOCK_DESTROY(_sb) mtx_destroy(SOCKBUF_MTX(_sb)) #define SOCKBUF_LOCK(_sb) mtx_lock(SOCKBUF_MTX(_sb)) #define SOCKBUF_OWNED(_sb) mtx_owned(SOCKBUF_MTX(_sb)) #define SOCKBUF_UNLOCK(_sb) mtx_unlock(SOCKBUF_MTX(_sb)) #define SOCKBUF_LOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_OWNED) #define SOCKBUF_UNLOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_NOTOWNED) /* * Socket buffer private mbuf(9) flags. */ #define M_NOTREADY M_PROTO1 /* m_data not populated yet */ #define M_BLOCKED M_PROTO2 /* M_NOTREADY in front of m */ #define M_NOTAVAIL (M_NOTREADY | M_BLOCKED) void sbappend(struct sockbuf *sb, struct mbuf *m, int flags); void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags); void sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags); void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags); int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control); int sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control); void sbappendrecord(struct sockbuf *sb, struct mbuf *m0); void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0); void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); struct mbuf * sbcreatecontrol(caddr_t p, int size, int type, int level); void sbdestroy(struct sockbuf *sb, struct socket *so); void sbdrop(struct sockbuf *sb, int len); void sbdrop_locked(struct sockbuf *sb, int len); struct mbuf * sbcut_locked(struct sockbuf *sb, int len); void sbdroprecord(struct sockbuf *sb); void sbdroprecord_locked(struct sockbuf *sb); void sbflush(struct sockbuf *sb); void sbflush_locked(struct sockbuf *sb); void sbrelease(struct sockbuf *sb, struct socket *so); void sbrelease_internal(struct sockbuf *sb, struct socket *so); void sbrelease_locked(struct sockbuf *sb, struct socket *so); int sbreserve(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td); int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td); struct mbuf * sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff); struct mbuf * sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff); void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb); int sbwait(struct sockbuf *sb); int sblock(struct sockbuf *sb, int flags); void sbunlock(struct sockbuf *sb); void sballoc(struct sockbuf *, struct mbuf *); void sbfree(struct sockbuf *, struct mbuf *); int sbready(struct sockbuf *, struct mbuf *, int); /* * Return how much data is available to be taken out of socket * buffer right now. */ static inline u_int sbavail(struct sockbuf *sb) { #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif return (sb->sb_acc); } /* * Return how much data sits there in the socket buffer * It might be that some data is not yet ready to be read. */ static inline u_int sbused(struct sockbuf *sb) { #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif return (sb->sb_ccc); } /* * How much space is there in a socket buffer (so->so_snd or so->so_rcv)? * This is problematical if the fields are unsigned, as the space might * still be negative (ccc > hiwat or mbcnt > mbmax). */ static inline long sbspace(struct sockbuf *sb) { int bleft, mleft; /* size should match sockbuf fields */ #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif if (sb->sb_flags & SB_STOP) return(0); bleft = sb->sb_hiwat - sb->sb_ccc; mleft = sb->sb_mbmax - sb->sb_mbcnt; return ((bleft < mleft) ? bleft : mleft); } #define SB_EMPTY_FIXUP(sb) do { \ if ((sb)->sb_mb == NULL) { \ (sb)->sb_mbtail = NULL; \ (sb)->sb_lastrecord = NULL; \ } \ } while (/*CONSTCOND*/0) #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *, const char *, int); void sblastmbufchk(struct sockbuf *, const char *, int); void sbcheck(struct sockbuf *, const char *, int); #define SBLASTRECORDCHK(sb) sblastrecordchk((sb), __FILE__, __LINE__) #define SBLASTMBUFCHK(sb) sblastmbufchk((sb), __FILE__, __LINE__) #define SBCHECK(sb) sbcheck((sb), __FILE__, __LINE__) #else #define SBLASTRECORDCHK(sb) do {} while (0) #define SBLASTMBUFCHK(sb) do {} while (0) #define SBCHECK(sb) do {} while (0) #endif /* SOCKBUF_DEBUG */ #endif /* _KERNEL */ #endif /* _SYS_SOCKBUF_H_ */ Index: head/sys/sys/socket.h =================================================================== --- head/sys/sys/socket.h (revision 319721) +++ head/sys/sys/socket.h (revision 319722) @@ -1,712 +1,716 @@ /*- * Copyright (c) 1982, 1985, 1986, 1988, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socket.h 8.4 (Berkeley) 2/21/94 * $FreeBSD$ */ #ifndef _SYS_SOCKET_H_ #define _SYS_SOCKET_H_ #include #include #include #include /* * Definitions related to sockets: types, address families, options. */ /* * Data types. */ #if __BSD_VISIBLE #ifndef _GID_T_DECLARED typedef __gid_t gid_t; #define _GID_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif #endif #ifndef _SA_FAMILY_T_DECLARED typedef __sa_family_t sa_family_t; #define _SA_FAMILY_T_DECLARED #endif #ifndef _SOCKLEN_T_DECLARED typedef __socklen_t socklen_t; #define _SOCKLEN_T_DECLARED #endif #ifndef _SSIZE_T_DECLARED typedef __ssize_t ssize_t; #define _SSIZE_T_DECLARED #endif #if __BSD_VISIBLE #ifndef _UID_T_DECLARED typedef __uid_t uid_t; #define _UID_T_DECLARED #endif #endif #ifndef _UINT32_T_DECLARED typedef __uint32_t uint32_t; #define _UINT32_T_DECLARED #endif #ifndef _UINTPTR_T_DECLARED typedef __uintptr_t uintptr_t; #define _UINTPTR_T_DECLARED #endif /* * Types */ #define SOCK_STREAM 1 /* stream socket */ #define SOCK_DGRAM 2 /* datagram socket */ #define SOCK_RAW 3 /* raw-protocol interface */ #if __BSD_VISIBLE #define SOCK_RDM 4 /* reliably-delivered message */ #endif #define SOCK_SEQPACKET 5 /* sequenced packet stream */ #if __BSD_VISIBLE /* * Creation flags, OR'ed into socket() and socketpair() type argument. */ #define SOCK_CLOEXEC 0x10000000 #define SOCK_NONBLOCK 0x20000000 -#endif +#ifdef _KERNEL +/* + * Flags for accept1(), kern_accept4() and solisten_dequeue, in addition + * to SOCK_CLOEXEC and SOCK_NONBLOCK. + */ +#define ACCEPT4_INHERIT 0x1 +#define ACCEPT4_COMPAT 0x2 +#endif /* _KERNEL */ +#endif /* __BSD_VISIBLE */ /* * Option flags per-socket. */ #define SO_DEBUG 0x0001 /* turn on debugging info recording */ #define SO_ACCEPTCONN 0x0002 /* socket has had listen() */ #define SO_REUSEADDR 0x0004 /* allow local address reuse */ #define SO_KEEPALIVE 0x0008 /* keep connections alive */ #define SO_DONTROUTE 0x0010 /* just use interface addresses */ #define SO_BROADCAST 0x0020 /* permit sending of broadcast msgs */ #if __BSD_VISIBLE #define SO_USELOOPBACK 0x0040 /* bypass hardware when possible */ #endif #define SO_LINGER 0x0080 /* linger on close if data present */ #define SO_OOBINLINE 0x0100 /* leave received OOB data in line */ #if __BSD_VISIBLE #define SO_REUSEPORT 0x0200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x0400 /* timestamp received dgram traffic */ #define SO_NOSIGPIPE 0x0800 /* no SIGPIPE from EPIPE */ #define SO_ACCEPTFILTER 0x1000 /* there is an accept filter */ #define SO_BINTIME 0x2000 /* timestamp received dgram traffic */ #endif #define SO_NO_OFFLOAD 0x4000 /* socket cannot be offloaded */ #define SO_NO_DDP 0x8000 /* disable direct data placement */ /* * Additional options, not kept in so_options. */ #define SO_SNDBUF 0x1001 /* send buffer size */ #define SO_RCVBUF 0x1002 /* receive buffer size */ #define SO_SNDLOWAT 0x1003 /* send low-water mark */ #define SO_RCVLOWAT 0x1004 /* receive low-water mark */ #define SO_SNDTIMEO 0x1005 /* send timeout */ #define SO_RCVTIMEO 0x1006 /* receive timeout */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_TYPE 0x1008 /* get socket type */ #if __BSD_VISIBLE #define SO_LABEL 0x1009 /* socket's MAC label */ #define SO_PEERLABEL 0x1010 /* socket's peer's MAC label */ #define SO_LISTENQLIMIT 0x1011 /* socket's backlog limit */ #define SO_LISTENQLEN 0x1012 /* socket's complete queue length */ #define SO_LISTENINCQLEN 0x1013 /* socket's incomplete queue length */ #define SO_SETFIB 0x1014 /* use this FIB to route */ #define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) */ #define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */ #define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */ #define SO_TS_CLOCK 0x1017 /* clock type used for SO_TIMESTAMP */ #define SO_MAX_PACING_RATE 0x1018 /* socket's max TX pacing rate (Linux name) */ #endif #if __BSD_VISIBLE #define SO_TS_REALTIME_MICRO 0 /* microsecond resolution, realtime */ #define SO_TS_BINTIME 1 /* sub-nanosecond resolution, realtime */ #define SO_TS_REALTIME 2 /* nanosecond resolution, realtime */ #define SO_TS_MONOTONIC 3 /* nanosecond resolution, monotonic */ #define SO_TS_DEFAULT SO_TS_REALTIME_MICRO #define SO_TS_CLOCK_MAX SO_TS_MONOTONIC #endif /* * Space reserved for new socket options added by third-party vendors. * This range applies to all socket option levels. New socket options * in FreeBSD should always use an option value less than SO_VENDOR. */ #if __BSD_VISIBLE #define SO_VENDOR 0x80000000 #endif /* * Structure used for manipulating linger option. */ struct linger { int l_onoff; /* option on/off */ int l_linger; /* linger time */ }; #if __BSD_VISIBLE struct accept_filter_arg { char af_name[16]; char af_arg[256-16]; }; #endif /* * Level number for (get/set)sockopt() to apply to socket itself. */ #define SOL_SOCKET 0xffff /* options for socket level */ /* * Address families. */ #define AF_UNSPEC 0 /* unspecified */ #if __BSD_VISIBLE #define AF_LOCAL AF_UNIX /* local to host (pipes, portals) */ #endif #define AF_UNIX 1 /* standardized name for AF_LOCAL */ #define AF_INET 2 /* internetwork: UDP, TCP, etc. */ #if __BSD_VISIBLE #define AF_IMPLINK 3 /* arpanet imp addresses */ #define AF_PUP 4 /* pup protocols: e.g. BSP */ #define AF_CHAOS 5 /* mit CHAOS protocols */ #define AF_NETBIOS 6 /* SMB protocols */ #define AF_ISO 7 /* ISO protocols */ #define AF_OSI AF_ISO #define AF_ECMA 8 /* European computer manufacturers */ #define AF_DATAKIT 9 /* datakit protocols */ #define AF_CCITT 10 /* CCITT protocols, X.25 etc */ #define AF_SNA 11 /* IBM SNA */ #define AF_DECnet 12 /* DECnet */ #define AF_DLI 13 /* DEC Direct data link interface */ #define AF_LAT 14 /* LAT */ #define AF_HYLINK 15 /* NSC Hyperchannel */ #define AF_APPLETALK 16 /* Apple Talk */ #define AF_ROUTE 17 /* Internal Routing Protocol */ #define AF_LINK 18 /* Link layer interface */ #define pseudo_AF_XTP 19 /* eXpress Transfer Protocol (no AF) */ #define AF_COIP 20 /* connection-oriented IP, aka ST II */ #define AF_CNT 21 /* Computer Network Technology */ #define pseudo_AF_RTIP 22 /* Help Identify RTIP packets */ #define AF_IPX 23 /* Novell Internet Protocol */ #define AF_SIP 24 /* Simple Internet Protocol */ #define pseudo_AF_PIP 25 /* Help Identify PIP packets */ #define AF_ISDN 26 /* Integrated Services Digital Network*/ #define AF_E164 AF_ISDN /* CCITT E.164 recommendation */ #define pseudo_AF_KEY 27 /* Internal key-management function */ #endif #define AF_INET6 28 /* IPv6 */ #if __BSD_VISIBLE #define AF_NATM 29 /* native ATM access */ #define AF_ATM 30 /* ATM */ #define pseudo_AF_HDRCMPLT 31 /* Used by BPF to not rewrite headers * in interface output routine */ #define AF_NETGRAPH 32 /* Netgraph sockets */ #define AF_SLOW 33 /* 802.3ad slow protocol */ #define AF_SCLUSTER 34 /* Sitara cluster protocol */ #define AF_ARP 35 #define AF_BLUETOOTH 36 /* Bluetooth sockets */ #define AF_IEEE80211 37 /* IEEE 802.11 protocol */ #define AF_INET_SDP 40 /* OFED Socket Direct Protocol ipv4 */ #define AF_INET6_SDP 42 /* OFED Socket Direct Protocol ipv6 */ #define AF_MAX 42 /* * When allocating a new AF_ constant, please only allocate * even numbered constants for FreeBSD until 134 as odd numbered AF_ * constants 39-133 are now reserved for vendors. */ #define AF_VENDOR00 39 #define AF_VENDOR01 41 #define AF_VENDOR02 43 #define AF_VENDOR03 45 #define AF_VENDOR04 47 #define AF_VENDOR05 49 #define AF_VENDOR06 51 #define AF_VENDOR07 53 #define AF_VENDOR08 55 #define AF_VENDOR09 57 #define AF_VENDOR10 59 #define AF_VENDOR11 61 #define AF_VENDOR12 63 #define AF_VENDOR13 65 #define AF_VENDOR14 67 #define AF_VENDOR15 69 #define AF_VENDOR16 71 #define AF_VENDOR17 73 #define AF_VENDOR18 75 #define AF_VENDOR19 77 #define AF_VENDOR20 79 #define AF_VENDOR21 81 #define AF_VENDOR22 83 #define AF_VENDOR23 85 #define AF_VENDOR24 87 #define AF_VENDOR25 89 #define AF_VENDOR26 91 #define AF_VENDOR27 93 #define AF_VENDOR28 95 #define AF_VENDOR29 97 #define AF_VENDOR30 99 #define AF_VENDOR31 101 #define AF_VENDOR32 103 #define AF_VENDOR33 105 #define AF_VENDOR34 107 #define AF_VENDOR35 109 #define AF_VENDOR36 111 #define AF_VENDOR37 113 #define AF_VENDOR38 115 #define AF_VENDOR39 117 #define AF_VENDOR40 119 #define AF_VENDOR41 121 #define AF_VENDOR42 123 #define AF_VENDOR43 125 #define AF_VENDOR44 127 #define AF_VENDOR45 129 #define AF_VENDOR46 131 #define AF_VENDOR47 133 #endif /* * Structure used by kernel to store most * addresses. */ struct sockaddr { unsigned char sa_len; /* total length */ sa_family_t sa_family; /* address family */ char sa_data[14]; /* actually longer; address value */ }; #if __BSD_VISIBLE #define SOCK_MAXADDRLEN 255 /* longest possible addresses */ /* * Structure used by kernel to pass protocol * information in raw sockets. */ struct sockproto { unsigned short sp_family; /* address family */ unsigned short sp_protocol; /* protocol */ }; #endif #include #if __BSD_VISIBLE /* * Protocol families, same as address families for now. */ #define PF_UNSPEC AF_UNSPEC #define PF_LOCAL AF_LOCAL #define PF_UNIX PF_LOCAL /* backward compatibility */ #define PF_INET AF_INET #define PF_IMPLINK AF_IMPLINK #define PF_PUP AF_PUP #define PF_CHAOS AF_CHAOS #define PF_NETBIOS AF_NETBIOS #define PF_ISO AF_ISO #define PF_OSI AF_ISO #define PF_ECMA AF_ECMA #define PF_DATAKIT AF_DATAKIT #define PF_CCITT AF_CCITT #define PF_SNA AF_SNA #define PF_DECnet AF_DECnet #define PF_DLI AF_DLI #define PF_LAT AF_LAT #define PF_HYLINK AF_HYLINK #define PF_APPLETALK AF_APPLETALK #define PF_ROUTE AF_ROUTE #define PF_LINK AF_LINK #define PF_XTP pseudo_AF_XTP /* really just proto family, no AF */ #define PF_COIP AF_COIP #define PF_CNT AF_CNT #define PF_SIP AF_SIP #define PF_IPX AF_IPX #define PF_RTIP pseudo_AF_RTIP /* same format as AF_INET */ #define PF_PIP pseudo_AF_PIP #define PF_ISDN AF_ISDN #define PF_KEY pseudo_AF_KEY #define PF_INET6 AF_INET6 #define PF_NATM AF_NATM #define PF_ATM AF_ATM #define PF_NETGRAPH AF_NETGRAPH #define PF_SLOW AF_SLOW #define PF_SCLUSTER AF_SCLUSTER #define PF_ARP AF_ARP #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IEEE80211 AF_IEEE80211 #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP #define PF_MAX AF_MAX /* * Definitions for network related sysctl, CTL_NET. * * Second level is protocol family. * Third level is protocol number. * * Further levels are defined by the individual families. */ /* * PF_ROUTE - Routing table * * Three additional levels are defined: * Fourth: address family, 0 is wildcard * Fifth: type of info, defined below * Sixth: flag(s) to mask with for NET_RT_FLAGS */ #define NET_RT_DUMP 1 /* dump; may limit to a.f. */ #define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ #define NET_RT_IFLIST 3 /* survey interface list */ #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ #endif /* __BSD_VISIBLE */ /* * Maximum queue length specifiable by listen. */ #define SOMAXCONN 128 /* * Message header for recvmsg and sendmsg calls. * Used value-result for recvmsg, value only for sendmsg. */ struct msghdr { void *msg_name; /* optional address */ socklen_t msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ void *msg_control; /* ancillary data, see below */ socklen_t msg_controllen; /* ancillary data buffer len */ int msg_flags; /* flags on received message */ }; #define MSG_OOB 0x00000001 /* process out-of-band data */ #define MSG_PEEK 0x00000002 /* peek at incoming message */ #define MSG_DONTROUTE 0x00000004 /* send without using routing tables */ #define MSG_EOR 0x00000008 /* data completes record */ #define MSG_TRUNC 0x00000010 /* data discarded before delivery */ #define MSG_CTRUNC 0x00000020 /* control data lost before delivery */ #define MSG_WAITALL 0x00000040 /* wait for full request or error */ #if __BSD_VISIBLE #define MSG_DONTWAIT 0x00000080 /* this message should be nonblocking */ #define MSG_EOF 0x00000100 /* data completes connection */ /* 0x00000200 unused */ /* 0x00000400 unused */ /* 0x00000800 unused */ /* 0x00001000 unused */ #define MSG_NOTIFICATION 0x00002000 /* SCTP notification */ #define MSG_NBIO 0x00004000 /* FIONBIO mode, used by fifofs */ #define MSG_COMPAT 0x00008000 /* used in sendit() */ #endif #ifdef _KERNEL #define MSG_SOCALLBCK 0x00010000 /* for use by socket callbacks - soreceive (TCP) */ #endif #if __POSIX_VISIBLE >= 200809 #define MSG_NOSIGNAL 0x00020000 /* do not generate SIGPIPE on EOF */ #endif #if __BSD_VISIBLE #define MSG_CMSG_CLOEXEC 0x00040000 /* make received fds close-on-exec */ #define MSG_WAITFORONE 0x00080000 /* for recvmmsg() */ #endif #ifdef _KERNEL #define MSG_MORETOCOME 0x00100000 /* additional data pending */ #endif /* * Header for ancillary data objects in msg_control buffer. * Used for additional information with/about a datagram * not expressible by flags. The format is a sequence * of message elements headed by cmsghdr structures. */ struct cmsghdr { socklen_t cmsg_len; /* data byte count, including hdr */ int cmsg_level; /* originating protocol */ int cmsg_type; /* protocol-specific type */ /* followed by u_char cmsg_data[]; */ }; #if __BSD_VISIBLE /* * While we may have more groups than this, the cmsgcred struct must * be able to fit in an mbuf and we have historically supported a * maximum of 16 groups. */ #define CMGROUP_MAX 16 /* * Credentials structure, used to verify the identity of a peer * process that has sent us a message. This is allocated by the * peer process but filled in by the kernel. This prevents the * peer from lying about its identity. (Note that cmcred_groups[0] * is the effective GID.) */ struct cmsgcred { pid_t cmcred_pid; /* PID of sending process */ uid_t cmcred_uid; /* real UID of sending process */ uid_t cmcred_euid; /* effective UID of sending process */ gid_t cmcred_gid; /* real GID of sending process */ short cmcred_ngroups; /* number or groups */ gid_t cmcred_groups[CMGROUP_MAX]; /* groups */ }; /* * Socket credentials. */ struct sockcred { uid_t sc_uid; /* real user id */ uid_t sc_euid; /* effective user id */ gid_t sc_gid; /* real group id */ gid_t sc_egid; /* effective group id */ int sc_ngroups; /* number of supplemental groups */ gid_t sc_groups[1]; /* variable length */ }; /* * Compute size of a sockcred structure with groups. */ #define SOCKCREDSIZE(ngrps) \ (sizeof(struct sockcred) + (sizeof(gid_t) * ((ngrps) - 1))) #endif /* __BSD_VISIBLE */ /* given pointer to struct cmsghdr, return pointer to data */ #define CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \ _ALIGN(sizeof(struct cmsghdr))) /* given pointer to struct cmsghdr, return pointer to next cmsghdr */ #define CMSG_NXTHDR(mhdr, cmsg) \ ((char *)(cmsg) == (char *)0 ? CMSG_FIRSTHDR(mhdr) : \ ((char *)(cmsg) + _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len) + \ _ALIGN(sizeof(struct cmsghdr)) > \ (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \ (struct cmsghdr *)0 : \ (struct cmsghdr *)(void *)((char *)(cmsg) + \ _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len))) /* * RFC 2292 requires to check msg_controllen, in case that the kernel returns * an empty list for some reasons. */ #define CMSG_FIRSTHDR(mhdr) \ ((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \ (struct cmsghdr *)(mhdr)->msg_control : \ (struct cmsghdr *)0) #if __BSD_VISIBLE /* RFC 2292 additions */ #define CMSG_SPACE(l) (_ALIGN(sizeof(struct cmsghdr)) + _ALIGN(l)) #define CMSG_LEN(l) (_ALIGN(sizeof(struct cmsghdr)) + (l)) #endif #ifdef _KERNEL #define CMSG_ALIGN(n) _ALIGN(n) #endif /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* access rights (array of int) */ #if __BSD_VISIBLE #define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ #define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ #define SCM_BINTIME 0x04 /* timestamp (struct bintime) */ #define SCM_REALTIME 0x05 /* timestamp (struct timespec) */ #define SCM_MONOTONIC 0x06 /* timestamp (struct timespec) */ #endif #if __BSD_VISIBLE /* * 4.3 compat sockaddr, move to compat file later */ struct osockaddr { unsigned short sa_family; /* address family */ char sa_data[14]; /* up to 14 bytes of direct address */ }; /* * 4.3-compat message header (move to compat file later). */ struct omsghdr { char *msg_name; /* optional address */ int msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ char *msg_accrights; /* access rights sent/received */ int msg_accrightslen; }; #endif /* * howto arguments for shutdown(2), specified by Posix.1g. */ #define SHUT_RD 0 /* shut down the reading side */ #define SHUT_WR 1 /* shut down the writing side */ #define SHUT_RDWR 2 /* shut down both sides */ #if __BSD_VISIBLE /* for SCTP */ /* we cheat and use the SHUT_XX defines for these */ #define PRU_FLUSH_RD SHUT_RD #define PRU_FLUSH_WR SHUT_WR #define PRU_FLUSH_RDWR SHUT_RDWR #endif #if __BSD_VISIBLE /* * sendfile(2) header/trailer struct */ struct sf_hdtr { struct iovec *headers; /* pointer to an array of header struct iovec's */ int hdr_cnt; /* number of header iovec's */ struct iovec *trailers; /* pointer to an array of trailer struct iovec's */ int trl_cnt; /* number of trailer iovec's */ }; /* * Sendfile-specific flag(s) */ #define SF_NODISKIO 0x00000001 #define SF_MNOWAIT 0x00000002 /* obsolete */ #define SF_SYNC 0x00000004 #define SF_USER_READAHEAD 0x00000008 #define SF_NOCACHE 0x00000010 #define SF_FLAGS(rh, flags) (((rh) << 16) | (flags)) #ifdef _KERNEL #define SF_READAHEAD(flags) ((flags) >> 16) #endif /* _KERNEL */ /* * Sendmmsg/recvmmsg specific structure(s) */ struct mmsghdr { struct msghdr msg_hdr; /* message header */ ssize_t msg_len; /* message length */ }; #endif /* __BSD_VISIBLE */ #ifndef _KERNEL #include __BEGIN_DECLS int accept(int, struct sockaddr * __restrict, socklen_t * __restrict); int bind(int, const struct sockaddr *, socklen_t); int connect(int, const struct sockaddr *, socklen_t); #if __BSD_VISIBLE int accept4(int, struct sockaddr * __restrict, socklen_t * __restrict, int); int bindat(int, int, const struct sockaddr *, socklen_t); int connectat(int, int, const struct sockaddr *, socklen_t); #endif int getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockopt(int, int, int, void * __restrict, socklen_t * __restrict); int listen(int, int); ssize_t recv(int, void *, size_t, int); ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict); ssize_t recvmsg(int, struct msghdr *, int); #if __BSD_VISIBLE struct timespec; ssize_t recvmmsg(int, struct mmsghdr * __restrict, size_t, int, const struct timespec * __restrict); #endif ssize_t send(int, const void *, size_t, int); ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t); ssize_t sendmsg(int, const struct msghdr *, int); #if __BSD_VISIBLE int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int); ssize_t sendmmsg(int, struct mmsghdr * __restrict, size_t, int); int setfib(int); #endif int setsockopt(int, int, int, const void *, socklen_t); int shutdown(int, int); int sockatmark(int); int socket(int, int, int); int socketpair(int, int, int, int *); __END_DECLS #endif /* !_KERNEL */ #ifdef _KERNEL struct socket; struct tcpcb *so_sototcpcb(struct socket *so); struct inpcb *so_sotoinpcb(struct socket *so); struct sockbuf *so_sockbuf_snd(struct socket *); struct sockbuf *so_sockbuf_rcv(struct socket *); int so_state_get(const struct socket *); void so_state_set(struct socket *, int); int so_options_get(const struct socket *); void so_options_set(struct socket *, int); int so_error_get(const struct socket *); void so_error_set(struct socket *, int); int so_linger_get(const struct socket *); void so_linger_set(struct socket *, int); struct protosw *so_protosw_get(const struct socket *); void so_protosw_set(struct socket *, struct protosw *); void so_sorwakeup_locked(struct socket *so); void so_sowwakeup_locked(struct socket *so); void so_sorwakeup(struct socket *so); void so_sowwakeup(struct socket *so); void so_lock(struct socket *so); void so_unlock(struct socket *so); -void so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg); - -#endif - - +#endif /* _KERNEL */ #endif /* !_SYS_SOCKET_H_ */ Index: head/sys/sys/socketvar.h =================================================================== --- head/sys/sys/socketvar.h (revision 319721) +++ head/sys/sys/socketvar.h (revision 319722) @@ -1,426 +1,450 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 * * $FreeBSD$ */ #ifndef _SYS_SOCKETVAR_H_ #define _SYS_SOCKETVAR_H_ #include /* for TAILQ macros */ #include /* for struct selinfo */ #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif struct vnet; /* * Kernel structure per socket. * Contains send and receive buffer queues, * handle on protocol and pointer to protocol * private data and error information. */ typedef uint64_t so_gen_t; typedef int so_upcall_t(struct socket *, void *, int); struct socket; /*- * Locking key to struct socket: * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). - * (c) locked by SOCKBUF_LOCK(&so->so_rcv). - * (e) locked by ACCEPT_LOCK(). + * (cr) locked by SOCKBUF_LOCK(&so->so_rcv). + * (cs) locked by SOCKBUF_LOCK(&so->so_rcv). + * (e) locked by SOLISTEN_LOCK() of corresponding listening socket. * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. */ +TAILQ_HEAD(accept_queue, socket); struct socket { - int so_count; /* (b) reference count */ + struct mtx so_lock; + volatile u_int so_count; /* (b / refcount) */ + struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */ + struct selinfo so_wrsel; /* (b/cs) for so_snd */ short so_type; /* (a) generic type, see socket.h */ - short so_options; /* from socket call, see socket.h */ - short so_linger; /* time to linger while closing */ + short so_options; /* (b) from socket call, see socket.h */ + short so_linger; /* time to linger close(2) */ short so_state; /* (b) internal state flags SS_* */ - int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ -/* - * Variables for connection queuing. - * Socket where accepts occur is so_head in all subsidiary sockets. - * If so_head is 0, socket is not related to an accept. - * For head socket so_incomp queues partially completed connections, - * while so_comp is a queue of connections ready to be accepted. - * If a connection is aborted and it has so_head set, then - * it has to be pulled out of either so_incomp or so_comp. - * We allow connections to queue up based on current queue lengths - * and limit on number of queued connections for this socket. - */ - struct socket *so_head; /* (e) back pointer to listen socket */ - TAILQ_HEAD(, socket) so_incomp; /* (e) queue of partial unaccepted connections */ - TAILQ_HEAD(, socket) so_comp; /* (e) queue of complete unaccepted connections */ - TAILQ_ENTRY(socket) so_list; /* (e) list of unaccepted connections */ - u_int so_qlen; /* (e) number of unaccepted connections */ - u_int so_incqlen; /* (e) number of unaccepted incomplete - connections */ - u_int so_qlimit; /* (e) max number queued connections */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ - u_long so_oobmark; /* (c) chars to oob mark */ - - struct sockbuf so_rcv, so_snd; - struct ucred *so_cred; /* (a) user credentials */ struct label *so_label; /* (b) MAC label for socket */ - struct label *so_peerlabel; /* (b) cached MAC label for peer */ /* NB: generation count must not be first. */ so_gen_t so_gencnt; /* (h) generation count */ void *so_emuldata; /* (b) private data for emulators */ - struct so_accf { - struct accept_filter *so_accept_filter; - void *so_accept_filter_arg; /* saved filter args */ - char *so_accept_filter_str; /* saved user args */ - } *so_accf; struct osd osd; /* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach * some user-specified metadata to a socket, which then can be * used by the kernel for various actions. * so_user_cookie is used by ipfw/dummynet. */ int so_fibnum; /* routing domain for this socket */ uint32_t so_user_cookie; int so_ts_clock; /* type of the clock used for timestamps */ uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */ + union { + /* Regular (data flow) socket. */ + struct { + /* (cr, cs) Receive and send buffers. */ + struct sockbuf so_rcv, so_snd; - void *so_pspare[2]; /* general use */ - int so_ispare[2]; /* general use */ + /* (e) Our place on accept queue. */ + TAILQ_ENTRY(socket) so_list; + struct socket *so_listen; /* (b) */ + enum { + SQ_NONE = 0, + SQ_INCOMP = 0x0800, /* on sol_incomp */ + SQ_COMP = 0x1000, /* on sol_comp */ + } so_qstate; /* (b) */ + + /* (b) cached MAC label for peer */ + struct label *so_peerlabel; + u_long so_oobmark; /* chars to oob mark */ + }; + /* + * Listening socket, where accepts occur, is so_listen in all + * subsidiary sockets. If so_listen is NULL, socket is not + * related to an accept. For a listening socket itself + * sol_incomp queues partially completed connections, while + * sol_comp is a queue of connections ready to be accepted. + * If a connection is aborted and it has so_listen set, then + * it has to be pulled out of either sol_incomp or sol_comp. + * We allow connections to queue up based on current queue + * lengths and limit on number of queued connections for this + * socket. + */ + struct { + /* (e) queue of partial unaccepted connections */ + struct accept_queue sol_incomp; + /* (e) queue of complete unaccepted connections */ + struct accept_queue sol_comp; + u_int sol_qlen; /* (e) sol_comp length */ + u_int sol_incqlen; /* (e) sol_incomp length */ + u_int sol_qlimit; /* (e) queue limit */ + + /* accept_filter(9) optional data */ + struct accept_filter *sol_accept_filter; + void *sol_accept_filter_arg; /* saved filter args */ + char *sol_accept_filter_str; /* saved user args */ + + /* Optional upcall, for kernel socket. */ + so_upcall_t *sol_upcall; /* (e) */ + void *sol_upcallarg; /* (e) */ + + /* Socket buffer parameters, to be copied to + * dataflow sockets, accepted from this one. */ + int sol_sbrcv_lowat; + int sol_sbsnd_lowat; + u_int sol_sbrcv_hiwat; + u_int sol_sbsnd_hiwat; + short sol_sbrcv_flags; + short sol_sbsnd_flags; + sbintime_t sol_sbrcv_timeo; + sbintime_t sol_sbsnd_timeo; + }; + }; }; -/* - * Global accept mutex to serialize access to accept queues and - * fields associated with multiple sockets. This allows us to - * avoid defining a lock order between listen and accept sockets - * until such time as it proves to be a good idea. - */ -extern struct mtx accept_mtx; -#define ACCEPT_LOCK_ASSERT() mtx_assert(&accept_mtx, MA_OWNED) -#define ACCEPT_UNLOCK_ASSERT() mtx_assert(&accept_mtx, MA_NOTOWNED) -#define ACCEPT_LOCK() mtx_lock(&accept_mtx) -#define ACCEPT_UNLOCK() mtx_unlock(&accept_mtx) +#define SOCK_MTX(so) &(so)->so_lock +#define SOCK_LOCK(so) mtx_lock(&(so)->so_lock) +#define SOCK_OWNED(so) mtx_owned(&(so)->so_lock) +#define SOCK_UNLOCK(so) mtx_unlock(&(so)->so_lock) +#define SOCK_LOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_OWNED) +#define SOCK_UNLOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_NOTOWNED) -/* - * Per-socket mutex: we reuse the receive socket buffer mutex for space - * efficiency. This decision should probably be revisited as we optimize - * locking for the socket code. - */ -#define SOCK_MTX(_so) SOCKBUF_MTX(&(_so)->so_rcv) -#define SOCK_LOCK(_so) SOCKBUF_LOCK(&(_so)->so_rcv) -#define SOCK_OWNED(_so) SOCKBUF_OWNED(&(_so)->so_rcv) -#define SOCK_UNLOCK(_so) SOCKBUF_UNLOCK(&(_so)->so_rcv) -#define SOCK_LOCK_ASSERT(_so) SOCKBUF_LOCK_ASSERT(&(_so)->so_rcv) +#define SOLISTENING(sol) (((sol)->so_options & SO_ACCEPTCONN) != 0) +#define SOLISTEN_LOCK(sol) do { \ + mtx_lock(&(sol)->so_lock); \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ +} while (0) +#define SOLISTEN_TRYLOCK(sol) mtx_trylock(&(sol)->so_lock) +#define SOLISTEN_UNLOCK(sol) do { \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ + mtx_unlock(&(sol)->so_lock); \ +} while (0) +#define SOLISTEN_LOCK_ASSERT(sol) do { \ + mtx_assert(&(sol)->so_lock, MA_OWNED); \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ +} while (0) /* - * Socket state bits stored in so_qstate. - */ -#define SQ_INCOMP 0x0800 /* unaccepted, incomplete connection */ -#define SQ_COMP 0x1000 /* unaccepted, complete connection */ - -/* * Externalized form of struct socket used by the sysctl(3) interface. */ struct xsocket { size_t xso_len; /* length of this structure */ struct socket *xso_so; /* makes a convenient handle sometimes */ short so_type; short so_options; short so_linger; short so_state; caddr_t so_pcb; /* another convenient handle */ int xso_protocol; int xso_family; u_int so_qlen; u_int so_incqlen; u_int so_qlimit; short so_timeo; u_short so_error; pid_t so_pgid; u_long so_oobmark; struct xsockbuf so_rcv, so_snd; uid_t so_uid; /* XXX */ }; #ifdef _KERNEL /* * Macros for sockets and socket buffering. */ /* * Flags to sblock(). */ #define SBL_WAIT 0x00000001 /* Wait if not immediately available. */ #define SBL_NOINTR 0x00000002 /* Force non-interruptible sleep. */ #define SBL_VALID (SBL_WAIT | SBL_NOINTR) /* * Do we need to notify the other side when I/O is possible? */ #define sb_notify(sb) (((sb)->sb_flags & (SB_WAIT | SB_SEL | SB_ASYNC | \ SB_UPCALL | SB_AIO | SB_KNOTE)) != 0) /* do we have to send all at once on a socket? */ #define sosendallatonce(so) \ ((so)->so_proto->pr_flags & PR_ATOMIC) /* can we read something from so? */ #define soreadabledata(so) \ - (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \ - !TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error) + (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || (so)->so_error) #define soreadable(so) \ (soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE)) /* can we write something to so? */ #define sowriteable(so) \ ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && \ (((so)->so_state&SS_ISCONNECTED) || \ ((so)->so_proto->pr_flags&PR_CONNREQUIRED)==0)) || \ ((so)->so_snd.sb_state & SBS_CANTSENDMORE) || \ (so)->so_error) /* - * soref()/sorele() ref-count the socket structure. Note that you must - * still explicitly close the socket, but the last ref count will free - * the structure. + * soref()/sorele() ref-count the socket structure. + * soref() may be called without owning socket lock, but in that case a + * caller must own something that holds socket, and so_count must be not 0. + * Note that you must still explicitly close the socket, but the last ref + * count will free the structure. */ -#define soref(so) do { \ - SOCK_LOCK_ASSERT(so); \ - ++(so)->so_count; \ -} while (0) - +#define soref(so) refcount_acquire(&(so)->so_count) #define sorele(so) do { \ - ACCEPT_LOCK_ASSERT(); \ SOCK_LOCK_ASSERT(so); \ - if ((so)->so_count <= 0) \ - panic("sorele"); \ - if (--(so)->so_count == 0) \ + if (refcount_release(&(so)->so_count)) \ sofree(so); \ - else { \ + else \ SOCK_UNLOCK(so); \ - ACCEPT_UNLOCK(); \ - } \ } while (0) /* * In sorwakeup() and sowwakeup(), acquire the socket buffer lock to * avoid a non-atomic test-and-wakeup. However, sowakeup is * responsible for releasing the lock if it is called. We unlock only * if we don't call into sowakeup. If any code is introduced that * directly invokes the underlying sowakeup() primitives, it must * maintain the same semantics. */ #define sorwakeup_locked(so) do { \ SOCKBUF_LOCK_ASSERT(&(so)->so_rcv); \ if (sb_notify(&(so)->so_rcv)) \ sowakeup((so), &(so)->so_rcv); \ else \ SOCKBUF_UNLOCK(&(so)->so_rcv); \ } while (0) #define sorwakeup(so) do { \ SOCKBUF_LOCK(&(so)->so_rcv); \ sorwakeup_locked(so); \ } while (0) #define sowwakeup_locked(so) do { \ SOCKBUF_LOCK_ASSERT(&(so)->so_snd); \ if (sb_notify(&(so)->so_snd)) \ sowakeup((so), &(so)->so_snd); \ else \ SOCKBUF_UNLOCK(&(so)->so_snd); \ } while (0) #define sowwakeup(so) do { \ SOCKBUF_LOCK(&(so)->so_snd); \ sowwakeup_locked(so); \ } while (0) struct accept_filter { char accf_name[16]; int (*accf_callback) (struct socket *so, void *arg, int waitflag); void * (*accf_create) (struct socket *so, char *arg); void (*accf_destroy) (struct socket *so); SLIST_ENTRY(accept_filter) accf_next; }; #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_ACCF); MALLOC_DECLARE(M_PCB); MALLOC_DECLARE(M_SONAME); #endif /* * Socket specific helper hook point identifiers * Do not leave holes in the sequence, hook registration is a loop. */ #define HHOOK_SOCKET_OPT 0 #define HHOOK_SOCKET_CREATE 1 #define HHOOK_SOCKET_RCV 2 #define HHOOK_SOCKET_SND 3 #define HHOOK_FILT_SOREAD 4 #define HHOOK_FILT_SOWRITE 5 #define HHOOK_SOCKET_CLOSE 6 #define HHOOK_SOCKET_LAST HHOOK_SOCKET_CLOSE struct socket_hhook_data { struct socket *so; struct mbuf *m; void *hctx; /* hook point specific data*/ int status; }; extern int maxsockets; extern u_long sb_max; extern so_gen_t so_gencnt; struct file; struct filecaps; struct filedesc; struct mbuf; struct sockaddr; struct ucred; struct uio; /* 'which' values for socket upcalls. */ #define SO_RCV 1 #define SO_SND 2 /* Return values for socket upcalls. */ #define SU_OK 0 #define SU_ISCONNECTED 1 /* * From uipc_socket and friends */ int getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len); int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, u_int *fflagp, struct filecaps *havecaps); void soabort(struct socket *so); int soaccept(struct socket *so, struct sockaddr **nam); void soaio_enqueue(struct task *task); void soaio_rcv(void *context, int pending); void soaio_snd(void *context, int pending); int socheckuid(struct socket *so, uid_t uid); int sobind(struct socket *so, struct sockaddr *nam, struct thread *td); int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int soclose(struct socket *so); int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td); int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int soconnect2(struct socket *so1, struct socket *so2); int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td); int sodisconnect(struct socket *so); struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags); void sofree(struct socket *so); void sohasoutofband(struct socket *so); int solisten(struct socket *so, int backlog, struct thread *td); void solisten_proto(struct socket *so, int backlog); int solisten_proto_check(struct socket *so); +int solisten_dequeue(struct socket *, struct socket **, int); struct socket * sonewconn(struct socket *head, int connstatus); - - +struct socket * + sopeeloff(struct socket *); int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_stream(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_dgram(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_generic(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreserve(struct socket *so, u_long sndcc, u_long rcvcc); void sorflush(struct socket *so); int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int soshutdown(struct socket *so, int how); void sotoxsocket(struct socket *so, struct xsocket *xso); void soupcall_clear(struct socket *, int); void soupcall_set(struct socket *, int, so_upcall_t, void *); +void solisten_upcall_set(struct socket *, so_upcall_t, void *); void sowakeup(struct socket *so, struct sockbuf *sb); void sowakeup_aio(struct socket *so, struct sockbuf *sb); +void solisten_wakeup(struct socket *); int selsocket(struct socket *so, int events, struct timeval *tv, struct thread *td); /* * Accept filter functions (duh). */ int accept_filt_add(struct accept_filter *filt); int accept_filt_del(char *name); struct accept_filter *accept_filt_get(char *name); #ifdef ACCEPT_FILTER_MOD #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_accf); #endif int accept_filt_generic_mod_event(module_t mod, int event, void *data); #endif #endif /* _KERNEL */ #endif /* !_SYS_SOCKETVAR_H_ */ Index: head/usr.bin/netstat/inet.c =================================================================== --- head/usr.bin/netstat/inet.c (revision 319721) +++ head/usr.bin/netstat/inet.c (revision 319722) @@ -1,1385 +1,1388 @@ /*- * Copyright (c) 1983, 1988, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static char sccsid[] = "@(#)inet.c 8.5 (Berkeley) 5/24/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "netstat.h" #include "nl_defs.h" void inetprint(const char *, struct in_addr *, int, const char *, int, const int); #ifdef INET6 static int udp_done, tcp_done, sdp_done; #endif /* INET6 */ static int pcblist_sysctl(int proto, const char *name, char **bufp) { const char *mibvar; char *buf; size_t len; switch (proto) { case IPPROTO_TCP: mibvar = "net.inet.tcp.pcblist"; break; case IPPROTO_UDP: mibvar = "net.inet.udp.pcblist"; break; case IPPROTO_DIVERT: mibvar = "net.inet.divert.pcblist"; break; default: mibvar = "net.inet.raw.pcblist"; break; } if (strncmp(name, "sdp", 3) == 0) mibvar = "net.inet.sdp.pcblist"; len = 0; if (sysctlbyname(mibvar, 0, &len, 0, 0) < 0) { if (errno != ENOENT) xo_warn("sysctl: %s", mibvar); return (0); } if ((buf = malloc(len)) == NULL) { xo_warnx("malloc %lu bytes", (u_long)len); return (0); } if (sysctlbyname(mibvar, buf, &len, 0, 0) < 0) { xo_warn("sysctl: %s", mibvar); free(buf); return (0); } *bufp = buf; return (1); } /* * Copied directly from uipc_socket2.c. We leave out some fields that are in * nested structures that aren't used to avoid extra work. */ static void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) { xsb->sb_cc = sb->sb_ccc; xsb->sb_hiwat = sb->sb_hiwat; xsb->sb_mbcnt = sb->sb_mbcnt; xsb->sb_mcnt = sb->sb_mcnt; xsb->sb_ccnt = sb->sb_ccnt; xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; xsb->sb_timeo = sb->sb_timeo; } int sotoxsocket(struct socket *so, struct xsocket *xso) { struct protosw proto; struct domain domain; bzero(xso, sizeof *xso); xso->xso_len = sizeof *xso; xso->xso_so = so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = so->so_pcb; if (kread((uintptr_t)so->so_proto, &proto, sizeof(proto)) != 0) return (-1); xso->xso_protocol = proto.pr_protocol; if (kread((uintptr_t)proto.pr_domain, &domain, sizeof(domain)) != 0) return (-1); xso->xso_family = domain.dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; - xso->so_oobmark = so->so_oobmark; - sbtoxsockbuf(&so->so_snd, &xso->so_snd); - sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + if (SOLISTENING(so)) { + xso->so_qlen = so->sol_qlen; + xso->so_incqlen = so->sol_incqlen; + xso->so_qlimit = so->sol_qlimit; + } else { + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_oobmark = so->so_oobmark; + } return (0); } /* * Print a summary of connections related to an Internet * protocol. For TCP, also give state of connection. * Listening processes (aflag) are suppressed unless the * -a (all) flag is specified. */ void protopr(u_long off, const char *name, int af1, int proto) { static int first = 1; int istcp; char *buf; const char *vchar; struct xtcpcb *tp; struct xinpcb *inp; struct xinpgen *xig, *oxig; struct xsocket *so; istcp = 0; switch (proto) { case IPPROTO_TCP: #ifdef INET6 if (strncmp(name, "sdp", 3) != 0) { if (tcp_done != 0) return; else tcp_done = 1; } else { if (sdp_done != 0) return; else sdp_done = 1; } #endif istcp = 1; break; case IPPROTO_UDP: #ifdef INET6 if (udp_done != 0) return; else udp_done = 1; #endif break; } if (!pcblist_sysctl(proto, name, &buf)) return; oxig = xig = (struct xinpgen *)buf; for (xig = (struct xinpgen *)((char *)xig + xig->xig_len); xig->xig_len > sizeof(struct xinpgen); xig = (struct xinpgen *)((char *)xig + xig->xig_len)) { if (istcp) { tp = (struct xtcpcb *)xig; inp = &tp->xt_inp; } else { inp = (struct xinpcb *)xig; } so = &inp->xi_socket; /* Ignore sockets for protocols other than the desired one. */ if (so->xso_protocol != proto) continue; /* Ignore PCBs which were freed during copyout. */ if (inp->inp_gencnt > oxig->xig_gen) continue; if ((af1 == AF_INET && (inp->inp_vflag & INP_IPV4) == 0) #ifdef INET6 || (af1 == AF_INET6 && (inp->inp_vflag & INP_IPV6) == 0) #endif /* INET6 */ || (af1 == AF_UNSPEC && ((inp->inp_vflag & INP_IPV4) == 0 #ifdef INET6 && (inp->inp_vflag & INP_IPV6) == 0 #endif /* INET6 */ )) ) continue; if (!aflag && ( (istcp && tp->t_state == TCPS_LISTEN) || (af1 == AF_INET && inet_lnaof(inp->inp_laddr) == INADDR_ANY) #ifdef INET6 || (af1 == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) #endif /* INET6 */ || (af1 == AF_UNSPEC && (((inp->inp_vflag & INP_IPV4) != 0 && inet_lnaof(inp->inp_laddr) == INADDR_ANY) #ifdef INET6 || ((inp->inp_vflag & INP_IPV6) != 0 && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) #endif )) )) continue; if (first) { if (!Lflag) { xo_emit("Active Internet connections"); if (aflag) xo_emit(" (including servers)"); } else xo_emit( "Current listen queue sizes (qlen/incqlen/maxqlen)"); xo_emit("\n"); if (Aflag) xo_emit("{T:/%-*s} ", 2 * (int)sizeof(void *), "Tcpcb"); if (Lflag) xo_emit((Aflag && !Wflag) ? "{T:/%-5.5s} {T:/%-32.32s} {T:/%-18.18s}" : ((!Wflag || af1 == AF_INET) ? "{T:/%-5.5s} {T:/%-32.32s} {T:/%-22.22s}" : "{T:/%-5.5s} {T:/%-32.32s} {T:/%-45.45s}"), "Proto", "Listen", "Local Address"); else if (Tflag) xo_emit((Aflag && !Wflag) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-18.18s} {T:/%s}" : ((!Wflag || af1 == AF_INET) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-22.22s} {T:/%s}" : "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-45.45s} {T:/%s}"), "Proto", "Rexmit", "OOORcv", "0-win", "Local Address", "Foreign Address"); else { xo_emit((Aflag && !Wflag) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-18.18s} {T:/%-18.18s}" : ((!Wflag || af1 == AF_INET) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-22.22s} {T:/%-22.22s}" : "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-45.45s} {T:/%-45.45s}"), "Proto", "Recv-Q", "Send-Q", "Local Address", "Foreign Address"); if (!xflag && !Rflag) xo_emit(" (state)"); } if (xflag) { xo_emit(" {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} " "{T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} " "{T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} " "{T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s}", "R-MBUF", "S-MBUF", "R-CLUS", "S-CLUS", "R-HIWA", "S-HIWA", "R-LOWA", "S-LOWA", "R-BCNT", "S-BCNT", "R-BMAX", "S-BMAX"); xo_emit(" {T:/%7.7s} {T:/%7.7s} {T:/%7.7s} " "{T:/%7.7s} {T:/%7.7s} {T:/%7.7s}", "rexmt", "persist", "keep", "2msl", "delack", "rcvtime"); } else if (Rflag) { xo_emit(" {T:/%8.8s} {T:/%5.5s}", "flowid", "ftype"); } xo_emit("\n"); first = 0; } if (Lflag && so->so_qlimit == 0) continue; xo_open_instance("socket"); if (Aflag) { if (istcp) xo_emit("{q:address/%*lx} ", 2 * (int)sizeof(void *), (u_long)inp->inp_ppcb); else xo_emit("{q:address/%*lx} ", 2 * (int)sizeof(void *), (u_long)so->so_pcb); } #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) vchar = ((inp->inp_vflag & INP_IPV4) != 0) ? "46" : "6"; else #endif vchar = ((inp->inp_vflag & INP_IPV4) != 0) ? "4" : ""; if (istcp && (tp->t_flags & TF_TOE) != 0) xo_emit("{:protocol/%-3.3s%-2.2s/%s%s} ", "toe", vchar); else xo_emit("{:protocol/%-3.3s%-2.2s/%s%s} ", name, vchar); if (Lflag) { char buf1[33]; snprintf(buf1, sizeof buf1, "%u/%u/%u", so->so_qlen, so->so_incqlen, so->so_qlimit); xo_emit("{:listen-queue-sizes/%-32.32s} ", buf1); } else if (Tflag) { if (istcp) xo_emit("{:sent-retransmit-packets/%6u} " "{:received-out-of-order-packets/%6u} " "{:sent-zero-window/%6u} ", tp->t_sndrexmitpack, tp->t_rcvoopack, tp->t_sndzerowin); else xo_emit("{P:/%21s}", ""); } else { xo_emit("{:receive-bytes-waiting/%6u} " "{:send-bytes-waiting/%6u} ", so->so_rcv.sb_cc, so->so_snd.sb_cc); } if (numeric_port) { if (inp->inp_vflag & INP_IPV4) { inetprint("local", &inp->inp_laddr, (int)inp->inp_lport, name, 1, af1); if (!Lflag) inetprint("remote", &inp->inp_faddr, (int)inp->inp_fport, name, 1, af1); } #ifdef INET6 else if (inp->inp_vflag & INP_IPV6) { inet6print("local", &inp->in6p_laddr, (int)inp->inp_lport, name, 1); if (!Lflag) inet6print("remote", &inp->in6p_faddr, (int)inp->inp_fport, name, 1); } /* else nothing printed now */ #endif /* INET6 */ } else if (inp->inp_flags & INP_ANONPORT) { if (inp->inp_vflag & INP_IPV4) { inetprint("local", &inp->inp_laddr, (int)inp->inp_lport, name, 1, af1); if (!Lflag) inetprint("remote", &inp->inp_faddr, (int)inp->inp_fport, name, 0, af1); } #ifdef INET6 else if (inp->inp_vflag & INP_IPV6) { inet6print("local", &inp->in6p_laddr, (int)inp->inp_lport, name, 1); if (!Lflag) inet6print("remote", &inp->in6p_faddr, (int)inp->inp_fport, name, 0); } /* else nothing printed now */ #endif /* INET6 */ } else { if (inp->inp_vflag & INP_IPV4) { inetprint("local", &inp->inp_laddr, (int)inp->inp_lport, name, 0, af1); if (!Lflag) inetprint("remote", &inp->inp_faddr, (int)inp->inp_fport, name, inp->inp_lport != inp->inp_fport, af1); } #ifdef INET6 else if (inp->inp_vflag & INP_IPV6) { inet6print("local", &inp->in6p_laddr, (int)inp->inp_lport, name, 0); if (!Lflag) inet6print("remote", &inp->in6p_faddr, (int)inp->inp_fport, name, inp->inp_lport != inp->inp_fport); } /* else nothing printed now */ #endif /* INET6 */ } if (xflag) { xo_emit("{:receive-mbufs/%6u} {:send-mbufs/%6u} " "{:receive-clusters/%6u} {:send-clusters/%6u} " "{:receive-high-water/%6u} {:send-high-water/%6u} " "{:receive-low-water/%6u} {:send-low-water/%6u} " "{:receive-mbuf-bytes/%6u} {:send-mbuf-bytes/%6u} " "{:receive-mbuf-bytes-max/%6u} " "{:send-mbuf-bytes-max/%6u}", so->so_rcv.sb_mcnt, so->so_snd.sb_mcnt, so->so_rcv.sb_ccnt, so->so_snd.sb_ccnt, so->so_rcv.sb_hiwat, so->so_snd.sb_hiwat, so->so_rcv.sb_lowat, so->so_snd.sb_lowat, so->so_rcv.sb_mbcnt, so->so_snd.sb_mbcnt, so->so_rcv.sb_mbmax, so->so_snd.sb_mbmax); if (istcp) xo_emit(" {:retransmit-timer/%4d.%02d} " "{:persist-timer/%4d.%02d} " "{:keepalive-timer/%4d.%02d} " "{:msl2-timer/%4d.%02d} " "{:delay-ack-timer/%4d.%02d} " "{:inactivity-timer/%4d.%02d}", tp->tt_rexmt / 1000, (tp->tt_rexmt % 1000) / 10, tp->tt_persist / 1000, (tp->tt_persist % 1000) / 10, tp->tt_keep / 1000, (tp->tt_keep % 1000) / 10, tp->tt_2msl / 1000, (tp->tt_2msl % 1000) / 10, tp->tt_delack / 1000, (tp->tt_delack % 1000) / 10, tp->t_rcvtime / 1000, (tp->t_rcvtime % 1000) / 10); } if (istcp && !Lflag && !xflag && !Tflag && !Rflag) { if (tp->t_state < 0 || tp->t_state >= TCP_NSTATES) xo_emit("{:tcp-state/%d}", tp->t_state); else { xo_emit("{:tcp-state/%s}", tcpstates[tp->t_state]); #if defined(TF_NEEDSYN) && defined(TF_NEEDFIN) /* Show T/TCP `hidden state' */ if (tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) xo_emit("{:need-syn-or-fin/*}"); #endif /* defined(TF_NEEDSYN) && defined(TF_NEEDFIN) */ } } if (Rflag) { /* XXX: is this right Alfred */ xo_emit(" {:flow-id/%08x} {:flow-type/%5d}", inp->inp_flowid, inp->inp_flowtype); } xo_emit("\n"); xo_close_instance("socket"); } if (xig != oxig && xig->xig_gen != oxig->xig_gen) { if (oxig->xig_count > xig->xig_count) { xo_emit("Some {d:lost/%s} sockets may have been " "deleted.\n", name); } else if (oxig->xig_count < xig->xig_count) { xo_emit("Some {d:created/%s} sockets may have been " "created.\n", name); } else { xo_emit("Some {d:changed/%s} sockets may have been " "created or deleted.\n", name); } } free(buf); } /* * Dump TCP statistics structure. */ void tcp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct tcpstat tcpstat; uint64_t tcps_states[TCP_NSTATES]; #ifdef INET6 if (tcp_done != 0) return; else tcp_done = 1; #endif if (fetch_stats("net.inet.tcp.stats", off, &tcpstat, sizeof(tcpstat), kread_counters) != 0) return; if (fetch_stats_ro("net.inet.tcp.states", nl[N_TCPS_STATES].n_value, &tcps_states, sizeof(tcps_states), kread_counters) != 0) return; xo_open_container("tcp"); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (tcpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f, plural(tcpstat.f)) #define p1a(f, m) if (tcpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f) #define p2(f1, f2, m) if (tcpstat.f1 || tcpstat.f2 || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f1, plural(tcpstat.f1), \ (uintmax_t )tcpstat.f2, plural(tcpstat.f2)) #define p2a(f1, f2, m) if (tcpstat.f1 || tcpstat.f2 || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f1, plural(tcpstat.f1), \ (uintmax_t )tcpstat.f2) #define p3(f, m) if (tcpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f, pluralies(tcpstat.f)) p(tcps_sndtotal, "\t{:sent-packets/%ju} {N:/packet%s sent}\n"); p2(tcps_sndpack,tcps_sndbyte, "\t\t{:sent-data-packets/%ju} " "{N:/data packet%s} ({:sent-data-bytes/%ju} {N:/byte%s})\n"); p2(tcps_sndrexmitpack, tcps_sndrexmitbyte, "\t\t" "{:sent-retransmitted-packets/%ju} {N:/data packet%s} " "({:sent-retransmitted-bytes/%ju} {N:/byte%s}) " "{N:retransmitted}\n"); p(tcps_sndrexmitbad, "\t\t" "{:sent-unnecessary-retransmitted-packets/%ju} " "{N:/data packet%s unnecessarily retransmitted}\n"); p(tcps_mturesent, "\t\t{:sent-resends-by-mtu-discovery/%ju} " "{N:/resend%s initiated by MTU discovery}\n"); p2a(tcps_sndacks, tcps_delack, "\t\t{:sent-ack-only-packets/%ju} " "{N:/ack-only packet%s/} ({:sent-packets-delayed/%ju} " "{N:delayed})\n"); p(tcps_sndurg, "\t\t{:sent-urg-only-packets/%ju} " "{N:/URG only packet%s}\n"); p(tcps_sndprobe, "\t\t{:sent-window-probe-packets/%ju} " "{N:/window probe packet%s}\n"); p(tcps_sndwinup, "\t\t{:sent-window-update-packets/%ju} " "{N:/window update packet%s}\n"); p(tcps_sndctrl, "\t\t{:sent-control-packets/%ju} " "{N:/control packet%s}\n"); p(tcps_rcvtotal, "\t{:received-packets/%ju} " "{N:/packet%s received}\n"); p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t" "{:received-ack-packets/%ju} {N:/ack%s} " "{N:(for} {:received-ack-bytes/%ju} {N:/byte%s})\n"); p(tcps_rcvdupack, "\t\t{:received-duplicate-acks/%ju} " "{N:/duplicate ack%s}\n"); p(tcps_rcvacktoomuch, "\t\t{:received-acks-for-unsent-data/%ju} " "{N:/ack%s for unsent data}\n"); p2(tcps_rcvpack, tcps_rcvbyte, "\t\t" "{:received-in-sequence-packets/%ju} {N:/packet%s} " "({:received-in-sequence-bytes/%ju} {N:/byte%s}) " "{N:received in-sequence}\n"); p2(tcps_rcvduppack, tcps_rcvdupbyte, "\t\t" "{:received-completely-duplicate-packets/%ju} " "{N:/completely duplicate packet%s} " "({:received-completely-duplicate-bytes/%ju} {N:/byte%s})\n"); p(tcps_pawsdrop, "\t\t{:received-old-duplicate-packets/%ju} " "{N:/old duplicate packet%s}\n"); p2(tcps_rcvpartduppack, tcps_rcvpartdupbyte, "\t\t" "{:received-some-duplicate-packets/%ju} " "{N:/packet%s with some dup. data} " "({:received-some-duplicate-bytes/%ju} {N:/byte%s duped/})\n"); p2(tcps_rcvoopack, tcps_rcvoobyte, "\t\t{:received-out-of-order/%ju} " "{N:/out-of-order packet%s} " "({:received-out-of-order-bytes/%ju} {N:/byte%s})\n"); p2(tcps_rcvpackafterwin, tcps_rcvbyteafterwin, "\t\t" "{:received-after-window-packets/%ju} {N:/packet%s} " "({:received-after-window-bytes/%ju} {N:/byte%s}) " "{N:of data after window}\n"); p(tcps_rcvwinprobe, "\t\t{:received-window-probes/%ju} " "{N:/window probe%s}\n"); p(tcps_rcvwinupd, "\t\t{:receive-window-update-packets/%ju} " "{N:/window update packet%s}\n"); p(tcps_rcvafterclose, "\t\t{:received-after-close-packets/%ju} " "{N:/packet%s received after close}\n"); p(tcps_rcvbadsum, "\t\t{:discard-bad-checksum/%ju} " "{N:/discarded for bad checksum%s}\n"); p(tcps_rcvbadoff, "\t\t{:discard-bad-header-offset/%ju} " "{N:/discarded for bad header offset field%s}\n"); p1a(tcps_rcvshort, "\t\t{:discard-too-short/%ju} " "{N:discarded because packet too short}\n"); p1a(tcps_rcvmemdrop, "\t\t{:discard-memory-problems/%ju} " "{N:discarded due to memory problems}\n"); p(tcps_connattempt, "\t{:connection-requests/%ju} " "{N:/connection request%s}\n"); p(tcps_accepts, "\t{:connections-accepts/%ju} " "{N:/connection accept%s}\n"); p(tcps_badsyn, "\t{:bad-connection-attempts/%ju} " "{N:/bad connection attempt%s}\n"); p(tcps_listendrop, "\t{:listen-queue-overflows/%ju} " "{N:/listen queue overflow%s}\n"); p(tcps_badrst, "\t{:ignored-in-window-resets/%ju} " "{N:/ignored RSTs in the window%s}\n"); p(tcps_connects, "\t{:connections-established/%ju} " "{N:/connection%s established (including accepts)}\n"); p(tcps_usedrtt, "\t\t{:connections-hostcache-rtt/%ju} " "{N:/time%s used RTT from hostcache}\n"); p(tcps_usedrttvar, "\t\t{:connections-hostcache-rttvar/%ju} " "{N:/time%s used RTT variance from hostcache}\n"); p(tcps_usedssthresh, "\t\t{:connections-hostcache-ssthresh/%ju} " "{N:/time%s used slow-start threshold from hostcache}\n"); p2(tcps_closed, tcps_drops, "\t{:connections-closed/%ju} " "{N:/connection%s closed (including} " "{:connection-drops/%ju} {N:/drop%s})\n"); p(tcps_cachedrtt, "\t\t{:connections-updated-rtt-on-close/%ju} " "{N:/connection%s updated cached RTT on close}\n"); p(tcps_cachedrttvar, "\t\t" "{:connections-updated-variance-on-close/%ju} " "{N:/connection%s updated cached RTT variance on close}\n"); p(tcps_cachedssthresh, "\t\t" "{:connections-updated-ssthresh-on-close/%ju} " "{N:/connection%s updated cached ssthresh on close}\n"); p(tcps_conndrops, "\t{:embryonic-connections-dropped/%ju} " "{N:/embryonic connection%s dropped}\n"); p2(tcps_rttupdated, tcps_segstimed, "\t{:segments-updated-rtt/%ju} " "{N:/segment%s updated rtt (of} " "{:segment-update-attempts/%ju} {N:/attempt%s})\n"); p(tcps_rexmttimeo, "\t{:retransmit-timeouts/%ju} " "{N:/retransmit timeout%s}\n"); p(tcps_timeoutdrop, "\t\t" "{:connections-dropped-by-retransmit-timeout/%ju} " "{N:/connection%s dropped by rexmit timeout}\n"); p(tcps_persisttimeo, "\t{:persist-timeout/%ju} " "{N:/persist timeout%s}\n"); p(tcps_persistdrop, "\t\t" "{:connections-dropped-by-persist-timeout/%ju} " "{N:/connection%s dropped by persist timeout}\n"); p(tcps_finwait2_drops, "\t" "{:connections-dropped-by-finwait2-timeout/%ju} " "{N:/Connection%s (fin_wait_2) dropped because of timeout}\n"); p(tcps_keeptimeo, "\t{:keepalive-timeout/%ju} " "{N:/keepalive timeout%s}\n"); p(tcps_keepprobe, "\t\t{:keepalive-probes/%ju} " "{N:/keepalive probe%s sent}\n"); p(tcps_keepdrops, "\t\t{:connections-dropped-by-keepalives/%ju} " "{N:/connection%s dropped by keepalive}\n"); p(tcps_predack, "\t{:ack-header-predictions/%ju} " "{N:/correct ACK header prediction%s}\n"); p(tcps_preddat, "\t{:data-packet-header-predictions/%ju} " "{N:/correct data packet header prediction%s}\n"); xo_open_container("syncache"); p3(tcps_sc_added, "\t{:entries-added/%ju} " "{N:/syncache entr%s added}\n"); p1a(tcps_sc_retransmitted, "\t\t{:retransmitted/%ju} " "{N:/retransmitted}\n"); p1a(tcps_sc_dupsyn, "\t\t{:duplicates/%ju} {N:/dupsyn}\n"); p1a(tcps_sc_dropped, "\t\t{:dropped/%ju} {N:/dropped}\n"); p1a(tcps_sc_completed, "\t\t{:completed/%ju} {N:/completed}\n"); p1a(tcps_sc_bucketoverflow, "\t\t{:bucket-overflow/%ju} " "{N:/bucket overflow}\n"); p1a(tcps_sc_cacheoverflow, "\t\t{:cache-overflow/%ju} " "{N:/cache overflow}\n"); p1a(tcps_sc_reset, "\t\t{:reset/%ju} {N:/reset}\n"); p1a(tcps_sc_stale, "\t\t{:stale/%ju} {N:/stale}\n"); p1a(tcps_sc_aborted, "\t\t{:aborted/%ju} {N:/aborted}\n"); p1a(tcps_sc_badack, "\t\t{:bad-ack/%ju} {N:/badack}\n"); p1a(tcps_sc_unreach, "\t\t{:unreachable/%ju} {N:/unreach}\n"); p(tcps_sc_zonefail, "\t\t{:zone-failures/%ju} {N:/zone failure%s}\n"); p(tcps_sc_sendcookie, "\t{:sent-cookies/%ju} {N:/cookie%s sent}\n"); p(tcps_sc_recvcookie, "\t{:receivd-cookies/%ju} " "{N:/cookie%s received}\n"); xo_close_container("syncache"); xo_open_container("hostcache"); p3(tcps_hc_added, "\t{:entries-added/%ju} " "{N:/hostcache entr%s added}\n"); p1a(tcps_hc_bucketoverflow, "\t\t{:buffer-overflows/%ju} " "{N:/bucket overflow}\n"); xo_close_container("hostcache"); xo_open_container("sack"); p(tcps_sack_recovery_episode, "\t{:recovery-episodes/%ju} " "{N:/SACK recovery episode%s}\n"); p(tcps_sack_rexmits, "\t{:segment-retransmits/%ju} " "{N:/segment rexmit%s in SACK recovery episodes}\n"); p(tcps_sack_rexmit_bytes, "\t{:byte-retransmits/%ju} " "{N:/byte rexmit%s in SACK recovery episodes}\n"); p(tcps_sack_rcv_blocks, "\t{:received-blocks/%ju} " "{N:/SACK option%s (SACK blocks) received}\n"); p(tcps_sack_send_blocks, "\t{:sent-option-blocks/%ju} " "{N:/SACK option%s (SACK blocks) sent}\n"); p1a(tcps_sack_sboverflow, "\t{:scoreboard-overflows/%ju} " "{N:/SACK scoreboard overflow}\n"); xo_close_container("sack"); xo_open_container("ecn"); p(tcps_ecn_ce, "\t{:ce-packets/%ju} " "{N:/packet%s with ECN CE bit set}\n"); p(tcps_ecn_ect0, "\t{:ect0-packets/%ju} " "{N:/packet%s with ECN ECT(0) bit set}\n"); p(tcps_ecn_ect1, "\t{:ect1-packets/%ju} " "{N:/packet%s with ECN ECT(1) bit set}\n"); p(tcps_ecn_shs, "\t{:handshakes/%ju} " "{N:/successful ECN handshake%s}\n"); p(tcps_ecn_rcwnd, "\t{:congestion-reductions/%ju} " "{N:/time%s ECN reduced the congestion window}\n"); xo_close_container("ecn"); xo_open_container("tcp-signature"); p(tcps_sig_rcvgoodsig, "\t{:received-good-signature/%ju} " "{N:/packet%s with matching signature received}\n"); p(tcps_sig_rcvbadsig, "\t{:received-bad-signature/%ju} " "{N:/packet%s with bad signature received}\n"); p(tcps_sig_err_buildsig, "\t{:failed-make-signature/%ju} " "{N:/time%s failed to make signature due to no SA}\n"); p(tcps_sig_err_sigopt, "\t{:no-signature-expected/%ju} " "{N:/time%s unexpected signature received}\n"); p(tcps_sig_err_nosigopt, "\t{:no-signature-provided/%ju} " "{N:/time%s no signature provided by segment}\n"); #undef p #undef p1a #undef p2 #undef p2a #undef p3 xo_close_container("tcp-signature"); xo_open_container("TCP connection count by state"); xo_emit("{T:/TCP connection count by state}:\n"); for (int i = 0; i < TCP_NSTATES; i++) { /* * XXXGL: is there a way in libxo to use %s * in the "content string" of a format * string? I failed to do that, that's why * a temporary buffer is used to construct * format string for xo_emit(). */ char fmtbuf[80]; if (sflag > 1 && tcps_states[i] == 0) continue; snprintf(fmtbuf, sizeof(fmtbuf), "\t{:%s/%%ju} " "{Np:/connection ,connections} in %s state\n", tcpstates[i], tcpstates[i]); xo_emit(fmtbuf, (uintmax_t )tcps_states[i]); } xo_close_container("TCP connection count by state"); xo_close_container("tcp"); } /* * Dump UDP statistics structure. */ void udp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct udpstat udpstat; uint64_t delivered; #ifdef INET6 if (udp_done != 0) return; else udp_done = 1; #endif if (fetch_stats("net.inet.udp.stats", off, &udpstat, sizeof(udpstat), kread_counters) != 0) return; xo_open_container("udp"); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (udpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)udpstat.f, plural(udpstat.f)) #define p1a(f, m) if (udpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)udpstat.f) p(udps_ipackets, "{:received-datagrams/%ju} " "{N:/datagram%s received}\n"); p1a(udps_hdrops, "{:dropped-incomplete-headers/%ju} " "{N:/with incomplete header}\n"); p1a(udps_badlen, "{:dropped-bad-data-length/%ju} " "{N:/with bad data length field}\n"); p1a(udps_badsum, "{:dropped-bad-checksum/%ju} " "{N:/with bad checksum}\n"); p1a(udps_nosum, "{:dropped-no-checksum/%ju} " "{N:/with no checksum}\n"); p1a(udps_noport, "{:dropped-no-socket/%ju} " "{N:/dropped due to no socket}\n"); p(udps_noportbcast, "{:dropped-broadcast-multicast/%ju} " "{N:/broadcast\\/multicast datagram%s undelivered}\n"); p1a(udps_fullsock, "{:dropped-full-socket-buffer/%ju} " "{N:/dropped due to full socket buffers}\n"); p1a(udpps_pcbhashmiss, "{:not-for-hashed-pcb/%ju} " "{N:/not for hashed pcb}\n"); delivered = udpstat.udps_ipackets - udpstat.udps_hdrops - udpstat.udps_badlen - udpstat.udps_badsum - udpstat.udps_noport - udpstat.udps_noportbcast - udpstat.udps_fullsock; if (delivered || sflag <= 1) xo_emit("\t{:delivered-packets/%ju} {N:/delivered}\n", (uint64_t)delivered); p(udps_opackets, "{:output-packets/%ju} {N:/datagram%s output}\n"); /* the next statistic is cumulative in udps_noportbcast */ p(udps_filtermcast, "{:multicast-source-filter-matches/%ju} " "{N:/time%s multicast source filter matched}\n"); #undef p #undef p1a xo_close_container("udp"); } /* * Dump CARP statistics structure. */ void carp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct carpstats carpstat; if (fetch_stats("net.inet.carp.stats", off, &carpstat, sizeof(carpstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (carpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)carpstat.f, plural(carpstat.f)) #define p2(f, m) if (carpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)carpstat.f) p(carps_ipackets, "\t{:received-inet-packets/%ju} " "{N:/packet%s received (IPv4)}\n"); p(carps_ipackets6, "\t{:received-inet6-packets/%ju} " "{N:/packet%s received (IPv6)}\n"); p(carps_badttl, "\t\t{:dropped-wrong-ttl/%ju} " "{N:/packet%s discarded for wrong TTL}\n"); p(carps_hdrops, "\t\t{:dropped-short-header/%ju} " "{N:/packet%s shorter than header}\n"); p(carps_badsum, "\t\t{:dropped-bad-checksum/%ju} " "{N:/discarded for bad checksum%s}\n"); p(carps_badver, "\t\t{:dropped-bad-version/%ju} " "{N:/discarded packet%s with a bad version}\n"); p2(carps_badlen, "\t\t{:dropped-short-packet/%ju} " "{N:/discarded because packet too short}\n"); p2(carps_badauth, "\t\t{:dropped-bad-authentication/%ju} " "{N:/discarded for bad authentication}\n"); p2(carps_badvhid, "\t\t{:dropped-bad-vhid/%ju} " "{N:/discarded for bad vhid}\n"); p2(carps_badaddrs, "\t\t{:dropped-bad-address-list/%ju} " "{N:/discarded because of a bad address list}\n"); p(carps_opackets, "\t{:sent-inet-packets/%ju} " "{N:/packet%s sent (IPv4)}\n"); p(carps_opackets6, "\t{:sent-inet6-packets/%ju} " "{N:/packet%s sent (IPv6)}\n"); p2(carps_onomem, "\t\t{:send-failed-memory-error/%ju} " "{N:/send failed due to mbuf memory error}\n"); #if notyet p(carps_ostates, "\t\t{:send-state-updates/%s} " "{N:/state update%s sent}\n"); #endif #undef p #undef p2 xo_close_container(name); } /* * Dump IP statistics structure. */ void ip_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct ipstat ipstat; if (fetch_stats("net.inet.ip.stats", off, &ipstat, sizeof(ipstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (ipstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )ipstat.f, plural(ipstat.f)) #define p1a(f, m) if (ipstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )ipstat.f) p(ips_total, "\t{:received-packets/%ju} " "{N:/total packet%s received}\n"); p(ips_badsum, "\t{:dropped-bad-checksum/%ju} " "{N:/bad header checksum%s}\n"); p1a(ips_toosmall, "\t{:dropped-below-minimum-size/%ju} " "{N:/with size smaller than minimum}\n"); p1a(ips_tooshort, "\t{:dropped-short-packets/%ju} " "{N:/with data size < data length}\n"); p1a(ips_toolong, "\t{:dropped-too-long/%ju} " "{N:/with ip length > max ip packet size}\n"); p1a(ips_badhlen, "\t{:dropped-short-header-length/%ju} " "{N:/with header length < data size}\n"); p1a(ips_badlen, "\t{:dropped-short-data/%ju} " "{N:/with data length < header length}\n"); p1a(ips_badoptions, "\t{:dropped-bad-options/%ju} " "{N:/with bad options}\n"); p1a(ips_badvers, "\t{:dropped-bad-version/%ju} " "{N:/with incorrect version number}\n"); p(ips_fragments, "\t{:received-fragments/%ju} " "{N:/fragment%s received}\n"); p(ips_fragdropped, "\t{:dropped-fragments/%ju} " "{N:/fragment%s dropped (dup or out of space)}\n"); p(ips_fragtimeout, "\t{:dropped-fragments-after-timeout/%ju} " "{N:/fragment%s dropped after timeout}\n"); p(ips_reassembled, "\t{:reassembled-packets/%ju} " "{N:/packet%s reassembled ok}\n"); p(ips_delivered, "\t{:received-local-packets/%ju} " "{N:/packet%s for this host}\n"); p(ips_noproto, "\t{:dropped-unknown-protocol/%ju} " "{N:/packet%s for unknown\\/unsupported protocol}\n"); p(ips_forward, "\t{:forwarded-packets/%ju} " "{N:/packet%s forwarded}"); p(ips_fastforward, " ({:fast-forwarded-packets/%ju} " "{N:/packet%s fast forwarded})"); if (ipstat.ips_forward || sflag <= 1) xo_emit("\n"); p(ips_cantforward, "\t{:packets-cannot-forward/%ju} " "{N:/packet%s not forwardable}\n"); p(ips_notmember, "\t{:received-unknown-multicast-group/%ju} " "{N:/packet%s received for unknown multicast group}\n"); p(ips_redirectsent, "\t{:redirects-sent/%ju} " "{N:/redirect%s sent}\n"); p(ips_localout, "\t{:sent-packets/%ju} " "{N:/packet%s sent from this host}\n"); p(ips_rawout, "\t{:send-packets-fabricated-header/%ju} " "{N:/packet%s sent with fabricated ip header}\n"); p(ips_odropped, "\t{:discard-no-mbufs/%ju} " "{N:/output packet%s dropped due to no bufs, etc.}\n"); p(ips_noroute, "\t{:discard-no-route/%ju} " "{N:/output packet%s discarded due to no route}\n"); p(ips_fragmented, "\t{:sent-fragments/%ju} " "{N:/output datagram%s fragmented}\n"); p(ips_ofragments, "\t{:fragments-created/%ju} " "{N:/fragment%s created}\n"); p(ips_cantfrag, "\t{:discard-cannot-fragment/%ju} " "{N:/datagram%s that can't be fragmented}\n"); p(ips_nogif, "\t{:discard-tunnel-no-gif/%ju} " "{N:/tunneling packet%s that can't find gif}\n"); p(ips_badaddr, "\t{:discard-bad-address/%ju} " "{N:/datagram%s with bad address in header}\n"); #undef p #undef p1a xo_close_container(name); } /* * Dump ARP statistics structure. */ void arp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct arpstat arpstat; if (fetch_stats("net.link.ether.arp.stats", off, &arpstat, sizeof(arpstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (arpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)arpstat.f, plural(arpstat.f)) #define p2(f, m) if (arpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)arpstat.f, pluralies(arpstat.f)) p(txrequests, "{:sent-requests/%ju} {N:/ARP request%s sent}\n"); p2(txreplies, "{:sent-replies/%ju} {N:/ARP repl%s sent}\n"); p(rxrequests, "{:received-requests/%ju} " "{N:/ARP request%s received}\n"); p2(rxreplies, "{:received-replies/%ju} " "{N:/ARP repl%s received}\n"); p(received, "{:received-packers/%ju} " "{N:/ARP packet%s received}\n"); p(dropped, "{:dropped-no-entry/%ju} " "{N:/total packet%s dropped due to no ARP entry}\n"); p(timeouts, "{:entries-timeout/%ju} " "{N:/ARP entry%s timed out}\n"); p(dupips, "{:dropped-duplicate-address/%ju} " "{N:/Duplicate IP%s seen}\n"); #undef p #undef p2 xo_close_container(name); } static const char *icmpnames[ICMP_MAXTYPE + 1] = { "echo reply", /* RFC 792 */ "#1", "#2", "destination unreachable", /* RFC 792 */ "source quench", /* RFC 792 */ "routing redirect", /* RFC 792 */ "#6", "#7", "echo", /* RFC 792 */ "router advertisement", /* RFC 1256 */ "router solicitation", /* RFC 1256 */ "time exceeded", /* RFC 792 */ "parameter problem", /* RFC 792 */ "time stamp", /* RFC 792 */ "time stamp reply", /* RFC 792 */ "information request", /* RFC 792 */ "information request reply", /* RFC 792 */ "address mask request", /* RFC 950 */ "address mask reply", /* RFC 950 */ "#19", "#20", "#21", "#22", "#23", "#24", "#25", "#26", "#27", "#28", "#29", "icmp traceroute", /* RFC 1393 */ "datagram conversion error", /* RFC 1475 */ "mobile host redirect", "IPv6 where-are-you", "IPv6 i-am-here", "mobile registration req", "mobile registration reply", "domain name request", /* RFC 1788 */ "domain name reply", /* RFC 1788 */ "icmp SKIP", "icmp photuris", /* RFC 2521 */ }; /* * Dump ICMP statistics. */ void icmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct icmpstat icmpstat; size_t len; int i, first; if (fetch_stats("net.inet.icmp.stats", off, &icmpstat, sizeof(icmpstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (icmpstat.f || sflag <= 1) \ xo_emit(m, icmpstat.f, plural(icmpstat.f)) #define p1a(f, m) if (icmpstat.f || sflag <= 1) \ xo_emit(m, icmpstat.f) #define p2(f, m) if (icmpstat.f || sflag <= 1) \ xo_emit(m, icmpstat.f, plurales(icmpstat.f)) p(icps_error, "\t{:icmp-calls/%lu} " "{N:/call%s to icmp_error}\n"); p(icps_oldicmp, "\t{:errors-not-from-message/%lu} " "{N:/error%s not generated in response to an icmp message}\n"); for (first = 1, i = 0; i < ICMP_MAXTYPE + 1; i++) { if (icmpstat.icps_outhist[i] != 0) { if (first) { xo_open_list("output-histogram"); xo_emit("\tOutput histogram:\n"); first = 0; } xo_open_instance("output-histogram"); if (icmpnames[i] != NULL) xo_emit("\t\t{k:name/%s}: {:count/%lu}\n", icmpnames[i], icmpstat.icps_outhist[i]); else xo_emit("\t\tunknown ICMP #{k:name/%d}: " "{:count/%lu}\n", i, icmpstat.icps_outhist[i]); xo_close_instance("output-histogram"); } } if (!first) xo_close_list("output-histogram"); p(icps_badcode, "\t{:dropped-bad-code/%lu} " "{N:/message%s with bad code fields}\n"); p(icps_tooshort, "\t{:dropped-too-short/%lu} " "{N:/message%s less than the minimum length}\n"); p(icps_checksum, "\t{:dropped-bad-checksum/%lu} " "{N:/message%s with bad checksum}\n"); p(icps_badlen, "\t{:dropped-bad-length/%lu} " "{N:/message%s with bad length}\n"); p1a(icps_bmcastecho, "\t{:dropped-multicast-echo/%lu} " "{N:/multicast echo requests ignored}\n"); p1a(icps_bmcasttstamp, "\t{:dropped-multicast-timestamp/%lu} " "{N:/multicast timestamp requests ignored}\n"); for (first = 1, i = 0; i < ICMP_MAXTYPE + 1; i++) { if (icmpstat.icps_inhist[i] != 0) { if (first) { xo_open_list("input-histogram"); xo_emit("\tInput histogram:\n"); first = 0; } xo_open_instance("input-histogram"); if (icmpnames[i] != NULL) xo_emit("\t\t{k:name/%s}: {:count/%lu}\n", icmpnames[i], icmpstat.icps_inhist[i]); else xo_emit( "\t\tunknown ICMP #{k:name/%d}: {:count/%lu}\n", i, icmpstat.icps_inhist[i]); xo_close_instance("input-histogram"); } } if (!first) xo_close_list("input-histogram"); p(icps_reflect, "\t{:sent-packets/%lu} " "{N:/message response%s generated}\n"); p2(icps_badaddr, "\t{:discard-invalid-return-address/%lu} " "{N:/invalid return address%s}\n"); p(icps_noroute, "\t{:discard-no-route/%lu} " "{N:/no return route%s}\n"); #undef p #undef p1a #undef p2 if (live) { len = sizeof i; if (sysctlbyname("net.inet.icmp.maskrepl", &i, &len, NULL, 0) < 0) return; xo_emit("\tICMP address mask responses are " "{q:icmp-address-responses/%sabled}\n", i ? "en" : "dis"); } xo_close_container(name); } /* * Dump IGMP statistics structure. */ void igmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct igmpstat igmpstat; if (fetch_stats("net.inet.igmp.stats", 0, &igmpstat, sizeof(igmpstat), kread) != 0) return; if (igmpstat.igps_version != IGPS_VERSION_3) { xo_warnx("%s: version mismatch (%d != %d)", __func__, igmpstat.igps_version, IGPS_VERSION_3); } if (igmpstat.igps_len != IGPS_VERSION3_LEN) { xo_warnx("%s: size mismatch (%d != %d)", __func__, igmpstat.igps_len, IGPS_VERSION3_LEN); } xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p64(f, m) if (igmpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t) igmpstat.f, plural(igmpstat.f)) #define py64(f, m) if (igmpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t) igmpstat.f, pluralies(igmpstat.f)) p64(igps_rcv_total, "\t{:received-messages/%ju} " "{N:/message%s received}\n"); p64(igps_rcv_tooshort, "\t{:dropped-too-short/%ju} " "{N:/message%s received with too few bytes}\n"); p64(igps_rcv_badttl, "\t{:dropped-wrong-ttl/%ju} " "{N:/message%s received with wrong TTL}\n"); p64(igps_rcv_badsum, "\t{:dropped-bad-checksum/%ju} " "{N:/message%s received with bad checksum}\n"); py64(igps_rcv_v1v2_queries, "\t{:received-membership-queries/%ju} " "{N:/V1\\/V2 membership quer%s received}\n"); py64(igps_rcv_v3_queries, "\t{:received-v3-membership-queries/%ju} " "{N:/V3 membership quer%s received}\n"); py64(igps_rcv_badqueries, "\t{:dropped-membership-queries/%ju} " "{N:/membership quer%s received with invalid field(s)}\n"); py64(igps_rcv_gen_queries, "\t{:received-general-queries/%ju} " "{N:/general quer%s received}\n"); py64(igps_rcv_group_queries, "\t{:received-group-queries/%ju} " "{N:/group quer%s received}\n"); py64(igps_rcv_gsr_queries, "\t{:received-group-source-queries/%ju} " "{N:/group-source quer%s received}\n"); py64(igps_drop_gsr_queries, "\t{:dropped-group-source-queries/%ju} " "{N:/group-source quer%s dropped}\n"); p64(igps_rcv_reports, "\t{:received-membership-requests/%ju} " "{N:/membership report%s received}\n"); p64(igps_rcv_badreports, "\t{:dropped-membership-reports/%ju} " "{N:/membership report%s received with invalid field(s)}\n"); p64(igps_rcv_ourreports, "\t" "{:received-membership-reports-matching/%ju} " "{N:/membership report%s received for groups to which we belong}" "\n"); p64(igps_rcv_nora, "\t{:received-v3-reports-no-router-alert/%ju} " "{N:/V3 report%s received without Router Alert}\n"); p64(igps_snd_reports, "\t{:sent-membership-reports/%ju} " "{N:/membership report%s sent}\n"); #undef p64 #undef py64 xo_close_container(name); } /* * Dump PIM statistics structure. */ void pim_stats(u_long off __unused, const char *name, int af1 __unused, int proto __unused) { struct pimstat pimstat; if (fetch_stats("net.inet.pim.stats", off, &pimstat, sizeof(pimstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (pimstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)pimstat.f, plural(pimstat.f)) #define py(f, m) if (pimstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)pimstat.f, pimstat.f != 1 ? "ies" : "y") p(pims_rcv_total_msgs, "\t{:received-messages/%ju} " "{N:/message%s received}\n"); p(pims_rcv_total_bytes, "\t{:received-bytes/%ju} " "{N:/byte%s received}\n"); p(pims_rcv_tooshort, "\t{:dropped-too-short/%ju} " "{N:/message%s received with too few bytes}\n"); p(pims_rcv_badsum, "\t{:dropped-bad-checksum/%ju} " "{N:/message%s received with bad checksum}\n"); p(pims_rcv_badversion, "\t{:dropped-bad-version/%ju} " "{N:/message%s received with bad version}\n"); p(pims_rcv_registers_msgs, "\t{:received-data-register-messages/%ju} " "{N:/data register message%s received}\n"); p(pims_rcv_registers_bytes, "\t{:received-data-register-bytes/%ju} " "{N:/data register byte%s received}\n"); p(pims_rcv_registers_wrongiif, "\t" "{:received-data-register-wrong-interface/%ju} " "{N:/data register message%s received on wrong iif}\n"); p(pims_rcv_badregisters, "\t{:received-bad-registers/%ju} " "{N:/bad register%s received}\n"); p(pims_snd_registers_msgs, "\t{:sent-data-register-messages/%ju} " "{N:/data register message%s sent}\n"); p(pims_snd_registers_bytes, "\t{:sent-data-register-bytes/%ju} " "{N:/data register byte%s sent}\n"); #undef p #undef py xo_close_container(name); } /* * Pretty print an Internet address (net address + port). */ void inetprint(const char *container, struct in_addr *in, int port, const char *proto, int num_port, const int af1) { struct servent *sp = 0; char line[80], *cp; int width; size_t alen, plen; if (container) xo_open_container(container); if (Wflag) snprintf(line, sizeof(line), "%s.", inetname(in)); else snprintf(line, sizeof(line), "%.*s.", (Aflag && !num_port) ? 12 : 16, inetname(in)); alen = strlen(line); cp = line + alen; if (!num_port && port) sp = getservbyport((int)port, proto); if (sp || port == 0) snprintf(cp, sizeof(line) - alen, "%.15s ", sp ? sp->s_name : "*"); else snprintf(cp, sizeof(line) - alen, "%d ", ntohs((u_short)port)); width = (Aflag && !Wflag) ? 18 : ((!Wflag || af1 == AF_INET) ? 22 : 45); if (Wflag) xo_emit("{d:target/%-*s} ", width, line); else xo_emit("{d:target/%-*.*s} ", width, width, line); plen = strlen(cp) - 1; alen--; xo_emit("{e:address/%*.*s}{e:port/%*.*s}", alen, alen, line, plen, plen, cp); if (container) xo_close_container(container); } /* * Construct an Internet address representation. * If numeric_addr has been supplied, give * numeric value, otherwise try for symbolic name. */ char * inetname(struct in_addr *inp) { char *cp; static char line[MAXHOSTNAMELEN]; struct hostent *hp; struct netent *np; cp = 0; if (!numeric_addr && inp->s_addr != INADDR_ANY) { int net = inet_netof(*inp); int lna = inet_lnaof(*inp); if (lna == INADDR_ANY) { np = getnetbyaddr(net, AF_INET); if (np) cp = np->n_name; } if (cp == NULL) { hp = gethostbyaddr((char *)inp, sizeof (*inp), AF_INET); if (hp) { cp = hp->h_name; trimdomain(cp, strlen(cp)); } } } if (inp->s_addr == INADDR_ANY) strcpy(line, "*"); else if (cp) { strlcpy(line, cp, sizeof(line)); } else { inp->s_addr = ntohl(inp->s_addr); #define C(x) ((u_int)((x) & 0xff)) snprintf(line, sizeof(line), "%u.%u.%u.%u", C(inp->s_addr >> 24), C(inp->s_addr >> 16), C(inp->s_addr >> 8), C(inp->s_addr)); } return (line); }