diff --git a/sys/dev/hyperv/hvsock/hv_sock.c b/sys/dev/hyperv/hvsock/hv_sock.c index bcc237271465..00e296fcd335 100644 --- a/sys/dev/hyperv/hvsock/hv_sock.c +++ b/sys/dev/hyperv/hvsock/hv_sock.c @@ -1,1773 +1,1773 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "hv_sock.h" #define HVSOCK_DBG_NONE 0x0 #define HVSOCK_DBG_INFO 0x1 #define HVSOCK_DBG_ERR 0x2 #define HVSOCK_DBG_VERBOSE 0x3 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket"); static int hvs_dbg_level; SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level, 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose"); #define HVSOCK_DBG(level, ...) do { \ if (hvs_dbg_level >= (level)) \ printf(__VA_ARGS__); \ } while (0) MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures"); /* The MTU is 16KB per host side's design */ #define HVSOCK_MTU_SIZE (1024 * 16) #define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header)) #define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header)) #define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ roundup2(payload_len, 8) + \ sizeof(uint64_t)) static struct domain hv_socket_domain; /* * HyperV Transport sockets */ static struct pr_usrreqs hvs_trans_usrreqs = { .pru_attach = hvs_trans_attach, .pru_bind = hvs_trans_bind, .pru_listen = hvs_trans_listen, .pru_accept = hvs_trans_accept, .pru_connect = hvs_trans_connect, .pru_peeraddr = hvs_trans_peeraddr, .pru_sockaddr = hvs_trans_sockaddr, .pru_soreceive = hvs_trans_soreceive, .pru_sosend = hvs_trans_sosend, .pru_disconnect = hvs_trans_disconnect, .pru_close = hvs_trans_close, .pru_detach = hvs_trans_detach, .pru_shutdown = hvs_trans_shutdown, .pru_abort = hvs_trans_abort, }; /* * Definitions of protocols supported in HyperV socket domain */ static struct protosw hv_socket_protosw[] = { { .pr_type = SOCK_STREAM, .pr_domain = &hv_socket_domain, .pr_protocol = HYPERV_SOCK_PROTO_TRANS, .pr_flags = PR_CONNREQUIRED, .pr_init = hvs_trans_init, .pr_usrreqs = &hvs_trans_usrreqs, }, 
}; static struct domain hv_socket_domain = { .dom_family = AF_HYPERV, .dom_name = "hyperv", .dom_protosw = hv_socket_protosw, .dom_protoswNPROTOSW = &hv_socket_protosw[nitems(hv_socket_protosw)] }; VNET_DOMAIN_SET(hv_socket_); #define MAX_PORT ((uint32_t)0xFFFFFFFF) #define MIN_PORT ((uint32_t)0x0) /* 00000000-facb-11e6-bd58-64006a7986d3 */ static const struct hyperv_guid srv_id_template = { .hv_guid = { 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11, 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } }; static int hvsock_br_callback(void *, int, void *); static uint32_t hvsock_canread_check(struct hvs_pcb *); static uint32_t hvsock_canwrite_check(struct hvs_pcb *); static int hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, uint32_t to_write, struct sockbuf *sb); /* Globals */ static struct sx hvs_trans_socks_sx; static struct mtx hvs_trans_socks_mtx; static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; static uint32_t previous_auto_bound_port; static void hvsock_print_guid(struct hyperv_guid *guid) { unsigned char *p = (unsigned char *)guid; HVSOCK_DBG(HVSOCK_DBG_INFO, "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", *(unsigned int *)p, *((unsigned short *) &p[4]), *((unsigned short *) &p[6]), p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); } static bool is_valid_srv_id(const struct hyperv_guid *id) { return !memcmp(&id->hv_guid[4], &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); } static unsigned int get_port_by_srv_id(const struct hyperv_guid *srv_id) { return *((const unsigned int *)srv_id); } static void set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) { *((unsigned int *)srv_id) = port; } static void __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) { struct hvs_pcb *p = NULL; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); if (!pcb) return; if (list & HVS_LIST_BOUND) { LIST_FOREACH(p, &hvs_trans_bound_socks, 
bound_next) if (p == pcb) LIST_REMOVE(p, bound_next); } if (list & HVS_LIST_CONNECTED) { LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) if (p == pcb) LIST_REMOVE(pcb, connected_next); } } static void __hvs_remove_socket_from_list(struct socket *so, unsigned char list) { struct hvs_pcb *pcb = so2hvspcb(so); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); __hvs_remove_pcb_from_list(pcb, list); } static void __hvs_insert_socket_on_list(struct socket *so, unsigned char list) { struct hvs_pcb *pcb = so2hvspcb(so); if (list & HVS_LIST_BOUND) LIST_INSERT_HEAD(&hvs_trans_bound_socks, pcb, bound_next); if (list & HVS_LIST_CONNECTED) LIST_INSERT_HEAD(&hvs_trans_connected_socks, pcb, connected_next); } void hvs_remove_socket_from_list(struct socket *so, unsigned char list) { if (!so || !so->so_pcb) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: socket or so_pcb is null\n", __func__); return; } mtx_lock(&hvs_trans_socks_mtx); __hvs_remove_socket_from_list(so, list); mtx_unlock(&hvs_trans_socks_mtx); } static void hvs_insert_socket_on_list(struct socket *so, unsigned char list) { if (!so || !so->so_pcb) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: socket or so_pcb is null\n", __func__); return; } mtx_lock(&hvs_trans_socks_mtx); __hvs_insert_socket_on_list(so, list); mtx_unlock(&hvs_trans_socks_mtx); } static struct socket * __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) { struct hvs_pcb *p = NULL; if (list & HVS_LIST_BOUND) LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) if (p->so != NULL && addr->hvs_port == p->local_addr.hvs_port) return p->so; if (list & HVS_LIST_CONNECTED) LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) if (p->so != NULL && addr->hvs_port == p->local_addr.hvs_port) return p->so; return NULL; } static struct socket * hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) { struct socket *s = NULL; mtx_lock(&hvs_trans_socks_mtx); s = __hvs_find_socket_on_list(addr, list); 
mtx_unlock(&hvs_trans_socks_mtx); return s; } static inline void hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) { memset(addr, 0, sizeof(*addr)); addr->sa_family = AF_HYPERV; addr->sa_len = sizeof(*addr); addr->hvs_port = port; } void hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id) { hvs_addr_set(addr, get_port_by_srv_id(svr_id)); } int hvs_trans_lock(void) { sx_xlock(&hvs_trans_socks_sx); return (0); } void hvs_trans_unlock(void) { sx_xunlock(&hvs_trans_socks_sx); } void hvs_trans_init(void) { /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; if (vm_guest != VM_GUEST_HV) return; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_init called\n", __func__); /* Initialize Globals */ previous_auto_bound_port = MAX_PORT; sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); mtx_init(&hvs_trans_socks_mtx, "hvs_trans_socks_mtx", NULL, MTX_DEF); LIST_INIT(&hvs_trans_bound_socks); LIST_INIT(&hvs_trans_connected_socks); } /* * Called in two cases: * 1) When user calls socket(); * 2) When we accept new incoming conneciton and call sonewconn(). 
*/
int
hvs_trans_attach(struct socket *so, int proto, struct thread *td)
{
	struct hvs_pcb *pcb = so2hvspcb(so);

	/* Only usable when running as a Hyper-V guest. */
	if (vm_guest != VM_GUEST_HV)
		return (ESOCKTNOSUPPORT);

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);

	if (so->so_type != SOCK_STREAM)
		return (ESOCKTNOSUPPORT);

	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
		return (EPROTONOSUPPORT);

	/* A pcb is already attached to this socket. */
	if (pcb != NULL)
		return (EISCONN);
	/* Zeroed pcb; M_NOWAIT means the allocation may fail. */
	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
	if (pcb == NULL)
		return (ENOMEM);

	/* Cross-link socket and pcb. */
	pcb->so = so;
	so->so_pcb = (void *)pcb;

	return (0);
}

/*
 * Detach the pcb from the socket under the global transport lock.
 * The pcb memory is zeroed and freed here only for listening sockets.
 * NOTE(review): for non-listening sockets the pcb is not freed in this
 * function; presumably it is released on the channel teardown path --
 * confirm against the rest of the file.
 */
void
hvs_trans_detach(struct socket *so)
{
	struct hvs_pcb *pcb;

	if (vm_guest != VM_GUEST_HV)
		return;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);

	(void) hvs_trans_lock();
	pcb = so2hvspcb(so);
	if (pcb == NULL) {
		hvs_trans_unlock();
		return;
	}

	if (SOLISTENING(so)) {
		bzero(pcb, sizeof(*pcb));
		free(pcb, M_HVSOCK);
	}

	so->so_pcb = NULL;

	hvs_trans_unlock();
}

/*
 * Bind the socket to the local port in 'addr'.
 *
 * Returns EINVAL for a NULL address, a socket without pcb or a wrong
 * sa_len, EAFNOSUPPORT for a family other than AF_HYPERV, EADDRINUSE if
 * the port is already on the bound or connected list, and 0 on success.
 * The lookup and the insertion happen under one hold of
 * hvs_trans_socks_mtx.
 */
int
hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
	int error = 0;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);

	if (sa == NULL) {
		return (EINVAL);
	}

	if (pcb == NULL) {
		return (EINVAL);
	}

	if (sa->sa_family != AF_HYPERV) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: Not supported, sa_family is %u\n",
		    __func__, sa->sa_family);
		return (EAFNOSUPPORT);
	}
	if (sa->sa_len != sizeof(*sa)) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: Not supported, sa_len is %u\n",
		    __func__, sa->sa_len);
		return (EINVAL);
	}

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);

	mtx_lock(&hvs_trans_socks_mtx);
	if (__hvs_find_socket_on_list(sa,
	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
		error = EADDRINUSE;
	} else {
		/*
		 * The address is available for us to bind.
		 * Add socket to the bound list.
		 */
		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
	}
	mtx_unlock(&hvs_trans_socks_mtx);

	return (error);
}

/*
 * Put a bound socket into the listening state.
 *
 * Returns EINVAL without a pcb, EADDRNOTAVAIL when the socket's local
 * port is not on the bound list or is bound by a different socket, and
 * otherwise the result of solisten_proto_check().
 */
int
hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct socket *bound_so;
	int error;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);

	if (pcb == NULL)
		return (EINVAL);

	/* Check if the address is already bound and it was by us. */
	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
	if (bound_so == NULL || bound_so != so) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: Address not bound or not by us.\n", __func__);
		return (EADDRNOTAVAIL);
	}

	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0)
		solisten_proto(so, backlog);
	SOCK_UNLOCK(so);

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket listen error = %d\n", __func__, error);
	return (error);
}

/*
 * Return a freshly allocated copy of the peer address in *nam.
 * Returns EINVAL without a pcb and ENOMEM if the copy cannot be
 * allocated (M_NOWAIT).
 */
int
hvs_trans_accept(struct socket *so, struct sockaddr **nam)
{
	struct hvs_pcb *pcb = so2hvspcb(so);

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);

	if (pcb == NULL)
		return (EINVAL);

	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
	    M_NOWAIT);

	return ((*nam == NULL) ?
ENOMEM : 0); } int hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct hvs_pcb *pcb = so2hvspcb(so); struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; bool found_auto_bound_port = false; int i, error = 0; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", __func__, raddr->hvs_port); if (pcb == NULL) return (EINVAL); /* Verify the remote address */ if (raddr == NULL) return (EINVAL); if (raddr->sa_family != AF_HYPERV) return (EAFNOSUPPORT); if (raddr->sa_len != sizeof(*raddr)) return (EINVAL); mtx_lock(&hvs_trans_socks_mtx); if (so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: socket connect in progress\n", __func__); error = EINPROGRESS; goto out; } /* * Find an available port for us to auto bind the local * address. */ hvs_addr_set(&pcb->local_addr, 0); for (i = previous_auto_bound_port - 1; i != previous_auto_bound_port; i --) { if (i == MIN_PORT) i = MAX_PORT; pcb->local_addr.hvs_port = i; if (__hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { found_auto_bound_port = true; previous_auto_bound_port = i; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: found local bound port is %x\n", __func__, pcb->local_addr.hvs_port); break; } } if (found_auto_bound_port == true) { /* Found available port for auto bound, put on list */ __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); /* Set VM service ID */ pcb->vm_srv_id = srv_id_template; set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); /* Set host service ID and remote port */ pcb->host_srv_id = srv_id_template; set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); /* Change the socket state to SS_ISCONNECTING */ soisconnecting(so); } else { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: No local port available for auto bound\n", __func__); error = EADDRINUSE; } HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id 
is "); hvsock_print_guid(&pcb->vm_srv_id); HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); hvsock_print_guid(&pcb->host_srv_id); out: mtx_unlock(&hvs_trans_socks_mtx); if (found_auto_bound_port == true) vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); return (error); } int hvs_trans_disconnect(struct socket *so) { struct hvs_pcb *pcb; if (vm_guest != VM_GUEST_HV) return (ESOCKTNOSUPPORT); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); (void) hvs_trans_lock(); pcb = so2hvspcb(so); if (pcb == NULL) { hvs_trans_unlock(); return (EINVAL); } /* If socket is already disconnected, skip this */ if ((so->so_state & SS_ISDISCONNECTED) == 0) soisdisconnecting(so); hvs_trans_unlock(); return (0); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) struct hvs_callback_arg { struct uio *uio; struct sockbuf *sb; }; int hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct hvs_pcb *pcb = so2hvspcb(so); struct sockbuf *sb; ssize_t orig_resid; uint32_t canread, to_read; int flags, error = 0; struct hvs_callback_arg cbarg; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); if (so->so_type != SOCK_STREAM) return (EINVAL); if (pcb == NULL) return (EINVAL); if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_PEEK) return (EOPNOTSUPP); /* If no space to copy out anything */ if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) return (EINVAL); sb = &so->so_rcv; orig_resid = uio->uio_resid; /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: sblock returned error = %d\n", __func__, error); return (error); } SOCKBUF_LOCK(sb); cbarg.uio = uio; cbarg.sb = sb; /* * If the socket is closing, there might still be some data * in rx br to read. 
However we need to make sure * the channel is still open. */
	if ((sb->sb_state & SBS_CANTRCVMORE) &&
	    (so->so_state & SS_ISDISCONNECTED)) {
		/* Other thread already closed the channel */
		error = EPIPE;
		goto out;
	}

	while (true) {
		/*
		 * Drain as much as the caller asked for from the packet(s)
		 * currently available on the receive ring.
		 */
		while (uio->uio_resid > 0 &&
		    (canread = hvsock_canread_check(pcb)) > 0) {
			to_read = MIN(canread, uio->uio_resid);
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
			    (unsigned int)(sizeof(struct hvs_pkt_header) +
			    pcb->recv_data_off));

			/*
			 * Copy out of the ring, skipping the packet header
			 * and the payload bytes already consumed.
			 */
			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
			    hvsock_br_callback, (void *)&cbarg);
			/*
			 * It is possible socket is disconnected because
			 * we released lock in hvsock_br_callback. So we
			 * need to check the state to make sure it is not
			 * disconnected.
			 */
			if (error || so->so_state & SS_ISDISCONNECTED) {
				break;
			}

			/* Account for the bytes consumed from this packet. */
			pcb->recv_data_len -= to_read;
			pcb->recv_data_off += to_read;
		}

		if (error)
			break;

		/* Abort if socket has reported problems. */
		if (so->so_error) {
			if (so->so_error == ESHUTDOWN &&
			    orig_resid > uio->uio_resid) {
				/*
				 * Although we got a FIN, we also received
				 * some data in this round. Deliver it
				 * to user.
				 */
				error = 0;
			} else {
				/* ESHUTDOWN alone is reported as 0 (EOF). */
				if (so->so_error != ESHUTDOWN)
					error = so->so_error;
			}

			break;
		}

		/* Cannot receive more. */
		if (sb->sb_state & SBS_CANTRCVMORE)
			break;

		/* We are done if buffer has been filled */
		if (uio->uio_resid == 0)
			break;

		/* Partial read is enough unless MSG_WAITALL was given. */
		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
			break;

		/* Buffer ring is empty and we shall not block */
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			if (orig_resid == uio->uio_resid) {
				/* We have not read anything */
				error = EAGAIN;
			}
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: non blocked read return, error %d.\n",
			    __func__, error);
			break;
		}

		/*
		 * Wait and block until (more) data comes in.
		 * Note: Drops the sockbuf lock during wait.
		 */
		error = sbwait(sb);

		if (error)
			break;

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: wake up from sbwait, read available is %u\n",
		    __func__, vmbus_chan_read_available(pcb->chan));
	}

out:
	SOCKBUF_UNLOCK(sb);

	sbunlock(sb);

	/* We received a FIN in this call */
	if (so->so_error == ESHUTDOWN) {
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			/* Send has already closed */
			soisdisconnecting(so);
		} else {
			/* Just close the receive side */
			socantrcvmore(so);
		}
	}

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: returning error = %d, so_error = %d\n",
	    __func__, error, so->so_error);

	return (error);
}

/*
 * Send the data described by 'uio' on a stream socket by writing
 * packets straight into the channel's send ring.
 *
 * Returns EINVAL for a non-stream socket, a missing pcb, or an empty or
 * non-write uio; EPIPE if the send side is shut down; EWOULDBLOCK when
 * nothing could be sent on a non-blocking socket.  The 'addr', 'top'
 * and 'controlp' arguments are not used by the code visible here.
 */
int
hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canwrite, to_write;
	int error = 0;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
	    __func__, uio->uio_resid);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	/* If nothing to send */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
		return (EINVAL);

	sb = &so->so_snd;

	orig_resid = uio->uio_resid;

	/* Prevent other writers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: sblock returned error = %d\n", __func__, error);
		return (error);
	}

	SOCKBUF_LOCK(sb);

	if ((sb->sb_state & SBS_CANTSENDMORE) ||
	    so->so_error == ESHUTDOWN) {
		error = EPIPE;
		goto out;
	}

	while (uio->uio_resid > 0) {
		canwrite = hvsock_canwrite_check(pcb);
		if (canwrite == 0) {
			/* We have sent some data */
			if (orig_resid > uio->uio_resid)
				break;
			/*
			 * We have not sent any data and it is
			 * non-blocked io
			 */
			if (so->so_state & SS_NBIO ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				error = EWOULDBLOCK;
				break;
			} else {
				/*
				 * We are here because there is no space on
				 * send buffer ring. Signal the other side
				 * to read and free more space.
* Sleep wait until space available to send
				 * Note: Drops the sockbuf lock during wait.
				 */
				error = sbwait(sb);

				if (error)
					break;

				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "%s: wake up from sbwait, space avail on "
				    "tx ring is %u\n",
				    __func__,
				    vmbus_chan_write_available(pcb->chan));

				continue;
			}
		}
		/*
		 * Send at most what fits on the ring, what the caller
		 * still has, and HVSOCK_SEND_BUF_SZ per packet.
		 */
		to_write = MIN(canwrite, uio->uio_resid);
		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: canwrite is %u, to_write = %u\n", __func__,
		    canwrite, to_write);
		error = hvsock_send_data(pcb->chan, uio, to_write, sb);

		if (error)
			break;
	}

out:
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);

	return (error);
}

/*
 * Return a freshly allocated copy of the remote address in *nam.
 * Returns EINVAL without a pcb, ENOMEM if the copy cannot be allocated.
 */
int
hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
{
	struct hvs_pcb *pcb = so2hvspcb(so);

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);

	if (pcb == NULL)
		return (EINVAL);

	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);

	return ((*nam == NULL)? ENOMEM : 0);
}

/*
 * Return a freshly allocated copy of the local address in *nam.
 * Returns EINVAL without a pcb, ENOMEM if the copy cannot be allocated.
 */
int
hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
{
	struct hvs_pcb *pcb = so2hvspcb(so);

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);

	if (pcb == NULL)
		return (EINVAL);

	*nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);

	return ((*nam == NULL)?
ENOMEM : 0);
}

/*
 * Close the socket: send a FIN (zero-length packet) to the peer if still
 * connected, mark the socket disconnected, and break the pcb's links to
 * the channel and the socket.  A listening socket is also removed from
 * the bound list.  Runs under the global transport lock.
 */
void
hvs_trans_close(struct socket *so)
{
	struct hvs_pcb *pcb;

	if (vm_guest != VM_GUEST_HV)
		return;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_close called\n", __func__);

	(void) hvs_trans_lock();
	pcb = so2hvspcb(so);
	if (!pcb) {
		hvs_trans_unlock();
		return;
	}

	if (so->so_state & SS_ISCONNECTED) {
		/* Send a FIN to peer */
		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
	}

	if (so->so_state &
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		soisdisconnected(so);

	/* From here on the pcb no longer refers to a channel or socket. */
	pcb->chan = NULL;
	pcb->so = NULL;

	if (SOLISTENING(so)) {
		mtx_lock(&hvs_trans_socks_mtx);
		/* Remove from bound list */
		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
		mtx_unlock(&hvs_trans_socks_mtx);
	}

	hvs_trans_unlock();

	return;
}

/*
 * Abort the socket: a listening socket is removed from the bound list
 * and a connected one is disconnected via sodisconnect().  Runs under
 * the global transport lock.
 */
void
hvs_trans_abort(struct socket *so)
{
	struct hvs_pcb *pcb = so2hvspcb(so);

	if (vm_guest != VM_GUEST_HV)
		return;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);

	(void) hvs_trans_lock();
	if (pcb == NULL) {
		hvs_trans_unlock();
		return;
	}

	if (SOLISTENING(so)) {
		mtx_lock(&hvs_trans_socks_mtx);
		/* Remove from bound list */
		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
		mtx_unlock(&hvs_trans_socks_mtx);
	}

	if (so->so_state & SS_ISCONNECTED) {
		(void) sodisconnect(so);
	}
	hvs_trans_unlock();

	return;
}

/*
 * Shut down the socket.  The state of the receive sockbuf is used to
 * tell the SHUT_WR case from the SHUT_RDWR case (see below).
 * Returns EINVAL without a pcb, otherwise 0.
 */
int
hvs_trans_shutdown(struct socket *so)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);

	if (pcb == NULL)
		return (EINVAL);

	/*
	 * Only get called with the shutdown method is SHUT_WR or
	 * SHUT_RDWR.
	 * When the method is SHUT_RD or SHUT_RDWR, the caller
	 * already set the SBS_CANTRCVMORE on receive side socket
	 * buffer.
	 */
	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		/*
		 * SHUT_WR only case.
		 * Receive side is still open. Just close
		 * the send side.
*/
		socantsendmore(so);
	} else {
		/* SHUT_RDWR case */
		if (so->so_state & SS_ISCONNECTED) {
			/* Send a FIN to peer */
			sb = &so->so_snd;
			SOCKBUF_LOCK(sb);
			(void) hvsock_send_data(pcb->chan, NULL, 0, sb);
			SOCKBUF_UNLOCK(sb);

			soisdisconnecting(so);
		}
	}

	return (0);
}

/* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
 * <port> (see struct sockaddr_hvs).
 *
 * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
 * guide/make-integration-service, and the endpoint is <VmId, ServiceId> with
 * the below sockaddr:
 *
 * struct SOCKADDR_HV
 * {
 *    ADDRESS_FAMILY Family;
 *    USHORT Reserved;
 *    GUID VmId;
 *    GUID ServiceId;
 * };
 * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
 * VMBus, because here it's obvious the host and the VM can easily identify
 * each other. Though the VmID is useful on the host, especially in the case
 * of Windows container, FreeBSD VM doesn't need it at all.
 *
 * To be compatible with similar infrastructure in Linux VMs, we have
 * to limit the available GUID space of SOCKADDR_HV so that we can create
 * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
 * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
 *
 ****************************************************************************
 * The only valid Service GUIDs, from the perspectives of both the host and *
 * FreeBSD VM, that can be connected by the other end, must conform to this *
 * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
 ****************************************************************************
 *
 * When we write apps on the host to connect(), the GUID ServiceID is used.
 * When we write apps in FreeBSD VM to connect(), we only need to specify the
 * port and the driver will form the GUID and use that to request the host.
 *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
 */

/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
 */
#define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)

/* Per-device state tying a device to its pcb and vmbus channel. */
struct hvsock_sc {
	device_t		dev;
	struct hvs_pcb		*pcb;
	struct vmbus_channel	*channel;
};

/*
 * Return true when the receive ring holds at least as many bytes as an
 * empty (zero-payload) hvsock packet.
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{
	uint32_t readable = vmbus_chan_read_available(chan);

	return (readable >= HVSOCK_PKT_LEN(0));
}

/*
 * Channel callback registered in hvsock_open_channel(); 'context' is the
 * pcb.  Wakes up a reader when the receive ring holds a packet and a
 * writer when the send ring has room.  Runs under the global transport
 * lock so the pcb's socket/channel links cannot change underneath it.
 */
static void
hvsock_chan_cb(struct vmbus_channel *chan, void *context)
{
	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
	struct socket *so;
	uint32_t canwrite;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: host send us a wakeup on rb data, pcb = %p\n",
	    __func__, pcb);

	/*
	 * Check if the socket is still attached and valid.
	 * Here we know channel is still open. Need to make
	 * sure the socket has not been closed or freed.
	 */
	(void) hvs_trans_lock();
	so = hsvpcb2so(pcb);
	if (pcb->chan != NULL && so != NULL) {
		/*
		 * Wake up reader if there are data to read.
		 * NOTE(review): only the else branch unlocks explicitly, so
		 * sorwakeup_locked() presumably drops the sockbuf lock
		 * itself -- confirm.
		 */
		SOCKBUF_LOCK(&(so)->so_rcv);

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: read available = %u\n", __func__,
		    vmbus_chan_read_available(pcb->chan));

		if (hvsock_chan_readable(pcb->chan))
			sorwakeup_locked(so);
		else
			SOCKBUF_UNLOCK(&(so)->so_rcv);

		/*
		 * Wake up sender if space becomes available to write.
*/ SOCKBUF_LOCK(&(so)->so_snd); canwrite = hvsock_canwrite_check(pcb); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: canwrite = %u\n", __func__, canwrite); if (canwrite > 0) { sowwakeup_locked(so); } else { SOCKBUF_UNLOCK(&(so)->so_snd); } } hvs_trans_unlock(); return; } static int hvsock_br_callback(void *datap, int cplen, void *cbarg) { struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; struct uio *uio = arg->uio; struct sockbuf *sb = arg->sb; int error = 0; if (cbarg == NULL || datap == NULL) return (EINVAL); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, " "datap = %p\n", __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br", uio->uio_resid, cplen, datap); if (sb) SOCKBUF_UNLOCK(sb); error = uiomove(datap, cplen, uio); if (sb) SOCKBUF_LOCK(sb); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: after uiomove, uio_resid = %zd, error = %d\n", __func__, uio->uio_resid, error); return (error); } static int hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, uint32_t to_write, struct sockbuf *sb) { struct hvs_pkt_header hvs_pkt; int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; uint64_t pad = 0; struct iovec iov[3]; struct hvs_callback_arg cbarg; if (chan == NULL) return (ENOTCONN); hlen = sizeof(struct vmbus_chanpkt_hdr); hvs_pkthlen = sizeof(struct hvs_pkt_header); hvs_pktlen = hvs_pkthlen + to_write; pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " "pad_pktlen = %u, data_len = %u\n", __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; hvs_pkt.chan_pkt_hdr.cph_flags = 0; VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); hvs_pkt.chan_pkt_hdr.cph_xactid = 0; hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; cbarg.uio = uio; cbarg.sb = sb; 
if (uio && to_write > 0) { iov[0].iov_base = &hvs_pkt; iov[0].iov_len = hvs_pkthlen; iov[1].iov_base = NULL; iov[1].iov_len = to_write; iov[2].iov_base = &pad; iov[2].iov_len = pad_pktlen - hvs_pktlen; error = vmbus_chan_iov_send(chan, iov, 3, hvsock_br_callback, &cbarg); } else { if (to_write == 0) { iov[0].iov_base = &hvs_pkt; iov[0].iov_len = hvs_pkthlen; iov[1].iov_base = &pad; iov[1].iov_len = pad_pktlen - hvs_pktlen; error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); } } if (error) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: error = %d\n", __func__, error); } return (error); } /* * Check if we have data on current ring buffer to read * or not. If not, advance the ring buffer read index to * next packet. Update the recev_data_len and recev_data_off * to new value. * Return the number of bytes can read. */ static uint32_t hvsock_canread_check(struct hvs_pcb *pcb) { uint32_t advance; uint32_t tlen, hlen, dlen; uint32_t bytes_canread = 0; int error; if (pcb == NULL || pcb->chan == NULL) { pcb->so->so_error = EIO; return (0); } /* Still have data not read yet on current packet */ if (pcb->recv_data_len > 0) return (pcb->recv_data_len); if (pcb->rb_init) advance = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); else advance = 0; bytes_canread = vmbus_chan_read_available(pcb->chan); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: bytes_canread on br = %u, advance = %u\n", __func__, bytes_canread, advance); if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { /* * Nothing to read. Need to advance the rindex before * calling sbwait, so host knows to wake us up when data * is available to read on rb. 
*/ error = vmbus_chan_recv_idxadv(pcb->chan, advance); if (error) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: after calling vmbus_chan_recv_idxadv, " "got error = %d\n", __func__, error); return (0); } else { pcb->rb_init = false; pcb->recv_data_len = 0; pcb->recv_data_off = 0; bytes_canread = vmbus_chan_read_available(pcb->chan); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: advanced %u bytes, " " bytes_canread on br now = %u\n", __func__, advance, bytes_canread); if (bytes_canread == 0) return (0); else advance = 0; } } if (bytes_canread < advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) return (0); error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, sizeof(struct hvs_pkt_header), advance); /* Don't have anything to read */ if (error) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: after calling vmbus_chan_recv_peek, got error = %d\n", __func__, error); return (0); } /* * We just read in a new packet header. Do some sanity checks. */ tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || __predict_false(hlen > tlen) || __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "invalid tlen(%u), hlen(%u) or dlen(%u)\n", tlen, hlen, dlen); pcb->so->so_error = EIO; return (0); } if (pcb->rb_init == false) pcb->rb_init = true; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", tlen, hlen, dlen); /* The other side has sent a close FIN */ if (dlen == 0) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: Received FIN from other side\n", __func__); /* inform the caller by seting so_error to ESHUTDOWN */ pcb->so->so_error = ESHUTDOWN; } HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: canread on receive ring is %u \n", __func__, dlen); pcb->recv_data_len = dlen; pcb->recv_data_off = 0; return (pcb->recv_data_len); } static uint32_t 
hvsock_canwrite_check(struct hvs_pcb *pcb) { uint32_t writeable; uint32_t ret; if (pcb == NULL || pcb->chan == NULL) return (0); writeable = vmbus_chan_write_available(pcb->chan); /* * We must always reserve a 0-length-payload packet for the FIN. */ HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: writeable is %u, should be greater than %ju\n", __func__, writeable, (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))); if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { /* * The Tx ring seems full. */ return (0); } ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: available size is %u\n", __func__, rounddown2(ret, 8)); return (rounddown2(ret, 8)); } static void hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) { vmbus_chan_set_pending_send_size(chan, HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); } static int hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) { unsigned int rcvbuf, sndbuf; struct hvs_pcb *pcb = so2hvspcb(so); int ret; if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { sndbuf = HVS_RINGBUF_SND_SIZE; rcvbuf = HVS_RINGBUF_RCV_SIZE; } else { sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); sndbuf = rounddown2(sndbuf, PAGE_SIZE); rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); } /* * Can only read whatever user provided size of data * from ring buffer. Turn off batched reading. 
*/ vmbus_chan_set_readbatch(chan, false); ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, hvsock_chan_cb, pcb); if (ret != 0) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: failed to open hvsock channel, sndbuf = %u, " "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); } else { HVSOCK_DBG(HVSOCK_DBG_INFO, "%s: hvsock channel opened, sndbuf = %u, i" "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); /* * Se the pending send size so to receive wakeup * signals from host when there is enough space on * rx buffer ring to write. */ hvsock_set_chan_pending_send_size(chan); } return ret; } /* * Guest is listening passively on the socket. Open channel and * create a new socket for the conneciton. */ static void hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, struct hvsock_sc *sc) { struct socket *new_so; struct hvs_pcb *new_pcb, *pcb; int error; /* Do nothing if socket is not listening */ - if ((so->so_options & SO_ACCEPTCONN) == 0) { + if (!SOLISTENING(so)) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: socket is not a listening one\n", __func__); return; } /* * Create a new socket. This will call pru_attach to complete * the socket initialization and put the new socket onto * listening socket's sol_incomp list, waiting to be promoted * to sol_comp list. * The new socket created has ref count 0. There is no other * thread that changes the state of this new one at the * moment, so we don't need to hold its lock while opening * channel and filling out its pcb information. */ new_so = sonewconn(so, 0); if (!new_so) HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: creating new socket failed\n", __func__); /* * Now open the vmbus channel. If it fails, the socket will be * on the listening socket's sol_incomp queue until it is * replaced and aborted. 
*/ error = hvsock_open_channel(chan, new_so); if (error) { new_so->so_error = error; return; } pcb = so->so_pcb; new_pcb = new_so->so_pcb; hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); /* Remote port is unknown to guest in this type of conneciton */ hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); new_pcb->chan = chan; new_pcb->recv_data_len = 0; new_pcb->recv_data_off = 0; new_pcb->rb_init = false; new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); sc->pcb = new_pcb; /* * Change the socket state to SS_ISCONNECTED. This will promote * the socket to sol_comp queue and wake up the thread which * is accepting connection. */ soisconnected(new_so); } /* * Guest is actively connecting to host. */ static void hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) { struct hvs_pcb *pcb; int error; error = hvsock_open_channel(chan, so); if (error) { so->so_error = error; return; } pcb = so->so_pcb; pcb->chan = chan; pcb->recv_data_len = 0; pcb->recv_data_off = 0; pcb->rb_init = false; mtx_lock(&hvs_trans_socks_mtx); __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); mtx_unlock(&hvs_trans_socks_mtx); /* * Change the socket state to SS_ISCONNECTED. This will wake up * the thread sleeping in connect call. 
*/ soisconnected(so); } static void hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) { struct hyperv_guid *inst_guid, *type_guid; bool conn_from_host; struct sockaddr_hvs addr; struct socket *so; struct hvs_pcb *pcb; type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); hvsock_print_guid(type_guid); HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); hvsock_print_guid(inst_guid); HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", (conn_from_host == true ) ? "from" : "to"); /* * The listening port should be in [0, MAX_LISTEN_PORT] */ if (!is_valid_srv_id(type_guid)) return; /* * There should be a bound socket already created no matter * it is a passive or active connection. * For host initiated connection (passive on guest side), * the type_guid contains the port which guest is bound and * listening. * For the guest initiated connection (active on guest side), * the inst_guid contains the port that guest has auto bound * to. */ hvs_addr_init(&addr, conn_from_host ? 
type_guid : inst_guid); so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); if (!so) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: no bound socket found for port %u\n", __func__, addr.hvs_port); return; } if (conn_from_host) { hvsock_open_conn_passive(chan, so, sc); } else { (void) hvs_trans_lock(); pcb = so->so_pcb; if (pcb && pcb->so) { sc->pcb = so2hvspcb(so); hvsock_open_conn_active(chan, so); } else { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: channel detached before open\n", __func__); } hvs_trans_unlock(); } } static int hvsock_probe(device_t dev) { struct vmbus_channel *channel = vmbus_get_channel(dev); if (!channel || !vmbus_chan_is_hvs(channel)) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_probe called but not a hvsock channel id %u\n", vmbus_chan_id(channel)); return ENXIO; } else { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_probe got a hvsock channel id %u\n", vmbus_chan_id(channel)); return BUS_PROBE_DEFAULT; } } static int hvsock_attach(device_t dev) { struct vmbus_channel *channel = vmbus_get_channel(dev); struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); hvsock_open_connection(channel, sc); /* * Always return success. On error the host will rescind the device * in 30 seconds and we can do cleanup at that time in * vmbus_chan_msgproc_chrescind(). 
*/ return (0); } static int hvsock_detach(device_t dev) { struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); struct socket *so; int error, retry; if (bootverbose) device_printf(dev, "hvsock_detach called.\n"); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); if (sc->pcb != NULL) { (void) hvs_trans_lock(); so = hsvpcb2so(sc->pcb); if (so) { /* Close the connection */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) soisdisconnected(so); } mtx_lock(&hvs_trans_socks_mtx); __hvs_remove_pcb_from_list(sc->pcb, HVS_LIST_BOUND | HVS_LIST_CONNECTED); mtx_unlock(&hvs_trans_socks_mtx); /* * Close channel while no reader and sender are working * on the buffer rings. */ if (so) { retry = 0; while ((error = sblock(&so->so_rcv, 0)) == EWOULDBLOCK) { /* * Someone is reading, rx br is busy */ soisdisconnected(so); DELAY(500); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "waiting for rx reader to exit, " "retry = %d\n", retry++); } retry = 0; while ((error = sblock(&so->so_snd, 0)) == EWOULDBLOCK) { /* * Someone is sending, tx br is busy */ soisdisconnected(so); DELAY(500); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "waiting for tx sender to exit, " "retry = %d\n", retry++); } } bzero(sc->pcb, sizeof(struct hvs_pcb)); free(sc->pcb, M_HVSOCK); sc->pcb = NULL; if (so) { sbunlock(&so->so_rcv); sbunlock(&so->so_snd); so->so_pcb = NULL; } hvs_trans_unlock(); } vmbus_chan_close(vmbus_get_channel(dev)); return (0); } static device_method_t hvsock_methods[] = { /* Device interface */ DEVMETHOD(device_probe, hvsock_probe), DEVMETHOD(device_attach, hvsock_attach), DEVMETHOD(device_detach, hvsock_detach), DEVMETHOD_END }; static driver_t hvsock_driver = { "hv_sock", hvsock_methods, sizeof(struct hvsock_sc) }; static devclass_t hvsock_devclass; DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL); MODULE_VERSION(hvsock, 1); MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); diff --git a/sys/kern/uipc_accf.c b/sys/kern/uipc_accf.c index 3ca64dd21e25..792c53c7baff 
100644 --- a/sys/kern/uipc_accf.c +++ b/sys/kern/uipc_accf.c @@ -1,310 +1,309 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000 Paycounter, Inc. * Copyright (c) 2005 Robert N. M. Watson * Author: Alfred Perlstein , * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #define ACCEPT_FILTER_MOD #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct mtx accept_filter_mtx; MTX_SYSINIT(accept_filter, &accept_filter_mtx, "accept_filter_mtx", MTX_DEF); #define ACCEPT_FILTER_LOCK() mtx_lock(&accept_filter_mtx) #define ACCEPT_FILTER_UNLOCK() mtx_unlock(&accept_filter_mtx) static SLIST_HEAD(, accept_filter) accept_filtlsthd = SLIST_HEAD_INITIALIZER(accept_filtlsthd); MALLOC_DEFINE(M_ACCF, "accf", "accept filter data"); static int unloadable = 0; SYSCTL_NODE(_net, OID_AUTO, accf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Accept filters"); SYSCTL_INT(_net_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, "Allow unload of accept filters (not recommended)"); /* * Must be passed a malloc'd structure so we don't explode if the kld is * unloaded, we leak the struct on deallocation to deal with this, but if a * filter is loaded with the same name as a leaked one we re-use the entry. 
*/ int accept_filt_add(struct accept_filter *filt) { struct accept_filter *p; ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, filt->accf_name) == 0) { if (p->accf_callback != NULL) { ACCEPT_FILTER_UNLOCK(); return (EEXIST); } else { p->accf_callback = filt->accf_callback; ACCEPT_FILTER_UNLOCK(); free(filt, M_ACCF); return (0); } } if (p == NULL) SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next); ACCEPT_FILTER_UNLOCK(); return (0); } int accept_filt_del(char *name) { struct accept_filter *p; p = accept_filt_get(name); if (p == NULL) return (ENOENT); p->accf_callback = NULL; return (0); } struct accept_filter * accept_filt_get(char *name) { struct accept_filter *p; ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, name) == 0) break; ACCEPT_FILTER_UNLOCK(); return (p); } int accept_filt_generic_mod_event(module_t mod, int event, void *data) { struct accept_filter *p; struct accept_filter *accfp = (struct accept_filter *) data; int error; switch (event) { case MOD_LOAD: p = malloc(sizeof(*p), M_ACCF, M_WAITOK); bcopy(accfp, p, sizeof(*p)); error = accept_filt_add(p); break; case MOD_UNLOAD: /* * Do not support unloading yet. we don't keep track of * refcounts and unloading an accept filter callback and then * having it called is a bad thing. A simple fix would be to * track the refcount in the struct accept_filter. 
*/ if (unloadable != 0) { error = accept_filt_del(accfp->accf_name); } else error = EOPNOTSUPP; break; case MOD_SHUTDOWN: error = 0; break; default: error = EOPNOTSUPP; break; } return (error); } int accept_filt_getopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; int error; error = 0; afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); SOCK_LOCK(so); - if ((so->so_options & SO_ACCEPTCONN) == 0) { + if (!SOLISTENING(so)) { error = EINVAL; goto out; } if (so->sol_accept_filter == NULL) { error = EINVAL; goto out; } strcpy(afap->af_name, so->sol_accept_filter->accf_name); if (so->sol_accept_filter_str != NULL) strcpy(afap->af_arg, so->sol_accept_filter_str); out: SOCK_UNLOCK(so); if (error == 0) error = sooptcopyout(sopt, afap, sizeof(*afap)); free(afap, M_TEMP); return (error); } int accept_filt_setopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; struct accept_filter *afp; char *accept_filter_str = NULL; void *accept_filter_arg = NULL; int error; /* * Handle the simple delete case first. */ if (sopt == NULL || sopt->sopt_val == NULL) { struct socket *sp, *sp1; int wakeup; SOCK_LOCK(so); - if ((so->so_options & SO_ACCEPTCONN) == 0) { + if (!SOLISTENING(so)) { SOCK_UNLOCK(so); return (EINVAL); } if (so->sol_accept_filter == NULL) { SOCK_UNLOCK(so); return (0); } if (so->sol_accept_filter->accf_destroy != NULL) so->sol_accept_filter->accf_destroy(so); if (so->sol_accept_filter_str != NULL) free(so->sol_accept_filter_str, M_ACCF); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->so_options &= ~SO_ACCEPTFILTER; /* * Move from incomplete queue to complete only those * connections, that are blocked by us. 
*/ wakeup = 0; TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) { SOCK_LOCK(sp); if (sp->so_options & SO_ACCEPTFILTER) { TAILQ_REMOVE(&so->sol_incomp, sp, so_list); TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list); sp->so_qstate = SQ_COMP; sp->so_options &= ~SO_ACCEPTFILTER; so->sol_incqlen--; so->sol_qlen++; wakeup = 1; } SOCK_UNLOCK(sp); } if (wakeup) solisten_wakeup(so); /* unlocks */ else SOLISTEN_UNLOCK(so); return (0); } /* * Pre-allocate any memory we may need later to avoid blocking at * untimely moments. This does not optimize for invalid arguments. */ afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; if (error) { free(afap, M_TEMP); return (error); } afp = accept_filt_get(afap->af_name); if (afp == NULL) { free(afap, M_TEMP); return (ENOENT); } if (afp->accf_create != NULL && afap->af_name[0] != '\0') { size_t len = strlen(afap->af_name) + 1; accept_filter_str = malloc(len, M_ACCF, M_WAITOK); strcpy(accept_filter_str, afap->af_name); } /* * Require a listen socket; don't try to replace an existing filter * without first removing it. */ SOCK_LOCK(so); - if ((so->so_options & SO_ACCEPTCONN) == 0 || - so->sol_accept_filter != NULL) { + if (!SOLISTENING(so) || so->sol_accept_filter != NULL) { error = EINVAL; goto out; } /* * Invoke the accf_create() method of the filter if required. The * socket mutex is held over this call, so create methods for filters * can't block. 
*/ if (afp->accf_create != NULL) { accept_filter_arg = afp->accf_create(so, afap->af_arg); if (accept_filter_arg == NULL) { error = EINVAL; goto out; } } so->sol_accept_filter = afp; so->sol_accept_filter_arg = accept_filter_arg; so->sol_accept_filter_str = accept_filter_str; accept_filter_str = NULL; so->so_options |= SO_ACCEPTFILTER; out: SOCK_UNLOCK(so); if (accept_filter_str != NULL) free(accept_filter_str, M_ACCF); free(afap, M_TEMP); return (error); } diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 94d7782b5f0a..46d9cb8f3a90 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,4421 +1,4421 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. 
This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. 
Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_lock(void *, int); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_lock(void *, int); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", 
"protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. 
*/ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. */ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). 
*/ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. 
*/ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ static void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. 
*/ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); } crfree(so->so_cred); mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. 
*/ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } if (prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0) return (ECAPMODE); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { sodealloc(so); return (error); } soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif static struct timeval overinterval = { 60, 0 }; SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, &overinterval, "Delay in seconds between warnings for listen socket overflows"); /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject * to space constraints, etc.) 
then we allocate a new structure, properly * linked into the data structure of the original socket, and return this. * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. * * Note: the ref count on the socket is 0 on return. */ struct socket * sonewconn(struct socket *head, int connstatus) { struct sbuf descrsb; struct socket *so; int len, overcount; u_int qlen; const char localprefix[] = "local:"; char descrbuf[SUNPATHLEN + sizeof(localprefix)]; #if defined(INET6) char addrbuf[INET6_ADDRSTRLEN]; #elif defined(INET) char addrbuf[INET_ADDRSTRLEN]; #endif bool dolog, over; SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif head->sol_overcount++; dolog = !!ratecheck(&head->sol_lastover, &overinterval); /* * If we're going to log, copy the overflow count and queue * length from the listen socket before dropping the lock. * Also, reset the overflow count. */ if (dolog) { overcount = head->sol_overcount; head->sol_overcount = 0; qlen = head->sol_qlen; } SOLISTEN_UNLOCK(head); if (dolog) { /* * Try to print something descriptive about the * socket for the error message. 
*/ sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), SBUF_FIXEDLEN); switch (head->so_proto->pr_domain->dom_family) { #if defined(INET) || defined(INET6) #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: if (head->so_proto->pr_domain->dom_family == AF_INET6 || (sotoinpcb(head)->inp_inc.inc_flags & INC_ISIPV6)) { ip6_sprintf(addrbuf, &sotoinpcb(head)->inp_inc.inc6_laddr); sbuf_printf(&descrsb, "[%s]", addrbuf); } else #endif { #ifdef INET inet_ntoa_r( sotoinpcb(head)->inp_inc.inc_laddr, addrbuf); sbuf_cat(&descrsb, addrbuf); #endif } sbuf_printf(&descrsb, ":%hu (proto %u)", ntohs(sotoinpcb(head)->inp_inc.inc_lport), head->so_proto->pr_protocol); break; #endif /* INET || INET6 */ case AF_UNIX: sbuf_cat(&descrsb, localprefix); if (sotounpcb(head)->unp_addr != NULL) len = sotounpcb(head)->unp_addr->sun_len - offsetof(struct sockaddr_un, sun_path); else len = 0; if (len > 0) sbuf_bcat(&descrsb, sotounpcb(head)->unp_addr->sun_path, len); else sbuf_cat(&descrsb, "(unknown)"); break; } /* * If we can't print something more specific, at least * print the domain name. 
*/ if (sbuf_finish(&descrsb) != 0 || sbuf_len(&descrsb) <= 0) { sbuf_clear(&descrsb); sbuf_cat(&descrsb, head->so_proto->pr_domain->dom_name ?: "unknown"); sbuf_finish(&descrsb); } KASSERT(sbuf_len(&descrsb) > 0, ("%s: sbuf creation failed", __func__)); log(LOG_DEBUG, "%s: pcb %p (%s): Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", __func__, head->so_pcb, sbuf_data(&descrsb), qlen, overcount); sbuf_delete(&descrsb); overcount = 0; } return (NULL); } SOLISTEN_UNLOCK(head); VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; so->so_options = head->so_options & ~SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; SOLISTEN_LOCK(head); if 
(head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); } return (so); } #if defined(SCTP) || defined(SCTP_SUPPORT) /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. 
*/ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; soref(so); return (so); } #endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * 
solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); CURVNET_RESTORE(); return (error); } int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) return (EINVAL); return (0); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. 
*/ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return single connection off a listening socket queue. Main consumer of * the function is kern_accept4(). Some modules, that do their own accept * management also use the function. * * Listening socket must be locked on entry and is returned unlocked on * return. 
* The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. */ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); soref(so); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele(head); *ret = so; return (0); } /* * Evaluate the reference count and named references on a socket; if no * references remain, free it. This should be called whenever a reference is * released, such as in sorele(), but also when named reference flags are * cleared in socket or protocol code. * * sofree() will free the socket if: * * - There are no outstanding file descriptor references or related consumers * (so_count == 0). * * - The socket has been closed by user space, if ever open (SS_NOFDREF). * * - The protocol does not have an outstanding strong reference on the socket * (SS_PROTOREF). * * - The socket is not in a completed connection queue, so a process has been * notified that it is present. If it is removed, the user process may * block in accept() despite select() saying the socket was ready. 
*/ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) { SOCK_UNLOCK(so); return; } if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { struct socket *sol; sol = so->so_listen; KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); /* * To solve race between close of a listening socket and * a socket on its incomplete queue, we need to lock both. * The order is first listening socket, then regular. * Since we don't have SS_NOFDREF neither SS_PROTOREF, this * function and the listening socket are the only pointers * to so. To preserve so and sol, we reference both and then * relock. * After relock the socket may not move to so_comp since it * doesn't have PCB already, but it may be removed from * so_incomp. If that happens, we share responsiblity on * freeing the socket, but soclose() has already removed * it from queue. */ soref(sol); soref(so); SOCK_UNLOCK(so); SOLISTEN_LOCK(sol); SOCK_LOCK(so); if (so->so_qstate == SQ_INCOMP) { KASSERT(so->so_listen == sol, ("%s: so %p migrated out of sol %p", __func__, so, sol)); TAILQ_REMOVE(&sol->sol_incomp, so, so_list); sol->sol_incqlen--; /* This is guarenteed not to be the last. 
*/ refcount_release(&sol->so_count); so->so_qstate = SQ_NONE; so->so_listen = NULL; } else KASSERT(so->so_listen == NULL, ("%s: so %p not on (in)comp with so_listen", __func__, so)); sorele(sol); KASSERT(so->so_count == 1, ("%s: so %p count %u", __func__, so, so->so_count)); so->so_count = 0; } if (SOLISTENING(so)) so->so_error = ECONNABORTED; SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(so); if (pr->pr_usrreqs->pru_detach != NULL) (*pr->pr_usrreqs->pru_detach)(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to * dom_dispose() and sbdestroy() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ if (!SOLISTENING(so)) { sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. 
*/
int
soclose(struct socket *so)
{
	struct accept_queue lqueue;
	bool listening;
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}

		if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);

	SOCK_LOCK(so);
-	if ((listening = (so->so_options & SO_ACCEPTCONN))) {
+	/*
+	 * Latch the listening state while so is still locked and referenced;
+	 * it is needed again after sorele(), when so must not be touched.
+	 */
+	if ((listening = SOLISTENING(so))) {
		struct socket *sp;

		TAILQ_INIT(&lqueue);
		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);

		so->sol_qlen = so->sol_incqlen = 0;

		TAILQ_FOREACH(sp, &lqueue, so_list) {
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			/* Guaranteed not to be the last. */
			refcount_release(&so->so_count);
		}
	}
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
+	/*
+	 * sorele() may have dropped the last reference and freed so, so do
+	 * not re-read SOLISTENING(so) here; use the value latched above.
+	 */
	if (listening) {
		struct socket *sp, *tsp;

		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) {
			SOCK_LOCK(sp);
			if (sp->so_count == 0) {
				SOCK_UNLOCK(sp);
				soabort(sp);
			} else
				/* sp is now in sofree() */
				SOCK_UNLOCK(sp);
		}
	}
	CURVNET_RESTORE();
	return (error);
}

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
* * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on, or races with other threads are risked. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. */ void soabort(struct socket *so) { /* * In as much as is possible, assert that no references to this * socket are held. This is not quite the same as asserting that the * current thread is responsible for arranging for no references, but * is as close as we can get for now. */ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); SOCK_LOCK(so); sofree(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; - if (so->so_options & SO_ACCEPTCONN) + /* XXXMJ racy: the listening state is tested without the socket lock held */ + if (SOLISTENING(so)) return (EOPNOTSUPP); CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. 
*/ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; if (fd == AT_FDCWD) { error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); } else { error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. 
*/ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. 
*/ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. 
*/ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pru_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_pruflag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; #ifdef KERN_TLS tls_pruflag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_pruflag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & 
SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? 
M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pru_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pru_flag |= tls_pruflag; #endif error = (*so->so_proto->pr_usrreqs->pru_send)(so, pru_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { if (error != 0) { m_freem(top); top = NULL; } else { soref(so); ktls_enqueue(top, so, tls_enq_cnt); } } #endif clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); if (!SOLISTENING(so)) error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td); else { 
m_freem(top);
		m_freem(control);
		error = ENOTCONN;
	}
	CURVNET_RESTORE();
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 *
 * 'flags' must contain MSG_OOB (asserted); only its MSG_PEEK bit is
 * forwarded to the protocol's pru_rcvoob method.  Returns the error from
 * pru_rcvoob or from uiomove(), or 0 on success.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
	VNET_SO_ASSERT(so);

	/* Fresh mbuf handed to the protocol; its contents are copied out below. */
	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	/*
	 * Copy out one mbuf at a time; each mbuf is freed as soon as it has
	 * been offered to uiomove(), and 'm' advances to whatever m_free()
	 * returns.
	 */
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	/* Release whatever is left of the chain (error or uio exhausted). */
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 *
 * The socket buffer lock must be held (asserted).
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.  We depend on the way that
 * records are added to the sockbuf by sbappend.  In particular, each record
 * (mbufs linked through m_next) must begin with an address if the protocol
 * so specifies, followed by an optional mbuf or mbufs containing ancillary
 * data, and then zero or more mbufs of data.  In order to allow parallelism
 * between network receive and copying to user space, as well as avoid
 * sleeping with a mutex held, we release the socket buffer mutex during the
 * user space copy.  Although the sockbuf is locked, new data may still be
 * appended, and thus we must maintain consistency of the sockbuf during that
 * time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
 * the count in uio_resid.
 *
 * Parameters:
 *   so       - socket to receive from.
 *   psa      - if non-NULL, cleared on entry; for PR_ADDR protocols it
 *              receives a sodupsockaddr() copy of the leading MT_SONAME mbuf.
 *   uio      - destination and byte budget (uio_resid).
 *   mp0      - if non-NULL, cleared on entry (except on the MSG_OOB path);
 *              data is handed back as an mbuf chain instead of being
 *              copied through the uio.
 *   controlp - if non-NULL, cleared on entry; receives control mbufs.
 *   flagsp   - if non-NULL, supplies the input flags (MSG_EOR is masked off)
 *              and has the resulting flags OR'ed in on the normal exit path;
 *              paths that jump to 'release' leave it untouched.
 *
 * Returns 0 or an errno value.  MSG_OOB requests are handed straight to
 * soreceive_rcvoob().
 */
int
soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, error, offset;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	/* Remembered so we can tell at the end whether anything was consumed. */
	ssize_t orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
	    && uio->uio_resid) {
		VNET_SO_ASSERT(so);
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	/* Serialize against other readers; held until 'release'. */
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);

restart:
	SOCKBUF_LOCK(&so->so_rcv);
	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_DONTWAIT is not set
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    sbavail(&so->so_rcv) < uio->uio_resid) &&
	    sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !sbavail(&so->so_rcv),
		    ("receive: m == %p sbavail == %u",
		    m, sbavail(&so->so_rcv)));
		if (so->so_error) {
			/* Deliver queued data first; report the error later. */
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			/* A peek must not consume the pending error. */
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m != NULL)
				goto dontblock;
#ifdef KERN_TLS
			else if (so->so_rcv.sb_tlsdcc == 0 &&
			    so->so_rcv.sb_tlscc == 0) {
#else
			else {
#endif
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
			}
		}
		/*
		 * Do not block if the queued chain contains OOB data or an
		 * end-of-record marker; restart from the head and deliver.
		 */
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
		    SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		SOCKBUF_UNLOCK(&so->so_rcv);
		if (error)
			goto release;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			/* Peeking: step past the address mbuf, leave it queued. */
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		/* 'cm' collects dequeued control mbufs; 'cme' is its tail link. */
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;
#ifdef KERN_TLS
		struct cmsghdr *cmsg;
		struct tls_get_record tgr;

		/*
		 * For MSG_TLSAPPDATA, check for a non-application data
		 * record.  If found, return ENXIO without removing
		 * it from the receive queue.  This allows a subsequent
		 * call without MSG_TLSAPPDATA to receive it.
		 * Note that, for TLS, there should only be a single
		 * control mbuf with the TLS_GET_RECORD message in it.
		 */
		if (flags & MSG_TLSAPPDATA) {
			cmsg = mtod(m, struct cmsghdr *);
			if (cmsg->cmsg_type == TLS_GET_RECORD &&
			    cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
				memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
				/* This will need to change for TLS 1.3. */
				if (tgr.tls_type != TLS_RLTYPE_APP) {
					SOCKBUF_UNLOCK(&so->so_rcv);
					error = ENXIO;
					goto release;
				}
			}
		}
#endif

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					/*
					 * NOTE(review): the M_NOWAIT copy's
					 * result is not checked before
					 * controlp is advanced through it.
					 */
					*controlp = m_copym(m, 0, m->m_len,
					    M_NOWAIT);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				/* Unlink from the sockbuf, append to 'cm'. */
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				/* Called with the sockbuf mutex dropped. */
				SOCKBUF_UNLOCK(&so->so_rcv);
				VNET_SO_ASSERT(so);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				/* Advance to the end of what was just linked. */
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		/* Re-read: the sockbuf may have changed while unlocked. */
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastercord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied.  Note that 'type' is
	 * used to store the type of any mbuf reads that have happened so far
	 * such that soreceive() can stop reading if the type changes, which
	 * causes soreceive() to return only one of regular data and inline
	 * out-of-band data in a single socket receive operation.
	 *
	 * 'moff' is the offset into the current mbuf (used when peeking);
	 * 'offset' is the number of bytes peeked so far, compared against
	 * so_oobmark.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
	    && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
			if (type != m->m_type)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
		    KASSERT(m->m_type == MT_DATA,
			("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		/* Clamp to the request, the OOB mark, and the current mbuf. */
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.  Otherwise copy
		 * them out via the uio, then free.  Sockbuf must be
		 * consistent here (points to current mbuf, it points to next
		 * record) when we drop priority; we must note any additions
		 * to the sockbuf when we block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			SOCKBUF_UNLOCK(&so->so_rcv);
			if ((m->m_flags & M_EXTPG) != 0)
				error = m_unmapped_uiomove(m, moff, uio,
				    (int)len);
			else
				error = uiomove(mtod(m, char *) + moff,
				    (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error) {
				/*
				 * The MT_SONAME mbuf has already been removed
				 * from the record, so it is necessary to
				 * remove the data mbufs, if any, to preserve
				 * the invariant in the case of PR_ADDR that
				 * requires MT_SONAME mbufs at the head of
				 * each record.
				 */
				if (pr->pr_flags & PR_ATOMIC &&
				    ((flags & MSG_PEEK) == 0))
					(void)sbdroprecord_locked(&so->so_rcv);
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
			}
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			/* The remainder of this mbuf was consumed. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					/* Hand the mbuf itself to the caller. */
					m->m_nextpkt = NULL;
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				sockbuf_pushsync(&so->so_rcv, nextrecord);
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			/* Only part of this mbuf was consumed. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					if (flags & MSG_DONTWAIT) {
						*mp = m_copym(m, 0, len,
						    M_NOWAIT);
						if (*mp == NULL) {
							/*
							 * m_copym() couldn't
							 * allocate an mbuf.
							 * Adjust uio_resid back
							 * (it was adjusted
							 * down by len bytes,
							 * which we didn't end
							 * up "copying" over).
							 */
							uio->uio_resid += len;
							break;
						}
					} else {
						/* M_WAITOK: drop the mutex. */
						SOCKBUF_UNLOCK(&so->so_rcv);
						*mp = m_copym(m, 0, len,
						    M_WAITOK);
						SOCKBUF_LOCK(&so->so_rcv);
					}
				}
				sbcut_locked(&so->so_rcv, len);
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
		 * must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return with a
		 * short count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0
		    && !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error ||
			    so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				VNET_SO_ASSERT(so);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			/*
			 * We could have received some data while we were
			 * notifying the protocol.  Skip blocking in this case.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				error = sbwait(&so->so_rcv);
				if (error) {
					SOCKBUF_UNLOCK(&so->so_rcv);
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	/* Atomic protocol with data left over: report truncation, drop rest. */
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord_locked(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/*
		 * If soreceive() is being done from the socket callback,
		 * then don't need to generate ACK to peer to update window,
		 * since ACK will be generated on return to TCP.
		 */
		if (!(flags & MSG_SOCALLBCK) &&
		    (pr->pr_flags & PR_WANTRCVD)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			VNET_SO_ASSERT(so);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	/*
	 * Nothing was consumed (and orig_resid was not zeroed above), no
	 * end-of-record was seen, and the receive side is still open: retry.
	 */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		SOCKBUF_UNLOCK(&so->so_rcv);
		goto restart;
	}
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	/* Every path reaching here has already dropped the sockbuf mutex. */
	sbunlock(&so->so_rcv);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
*/
/*
 * soreceive_stream() parameters and results, as implemented below:
 *   so       - socket; must be SOCK_STREAM, otherwise EINVAL is returned.
 *   psa      - if non-NULL, set to NULL (no address is produced here).
 *   uio      - destination and byte budget; uio_resid == 0 yields EINVAL.
 *   mp0      - if non-NULL, data is returned as an mbuf chain through *mp0
 *              instead of being copied through the uio.
 *   controlp - if non-NULL, set to NULL (no control data is produced here).
 *   flagsp   - if non-NULL, supplies the input flags (MSG_EOR masked off);
 *              it is not written back by this function.
 *
 * MSG_OOB requests are handed to soreceive_rcvoob(); with KERN_TLS, sockets
 * whose receive buffer has sb_tls_info set are handed to
 * soreceive_generic().  Returns 0 or an errno value (EINVAL, ENOTCONN,
 * EAGAIN, ENOBUFS, the pending so_error, or an error from sblock(),
 * sbwait() or m_mbuftouio()).
 */
int
soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (controlp != NULL)
		*controlp = NULL;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;

#ifdef KERN_TLS
	/*
	 * KTLS store TLS records as records with a control message to
	 * describe the framing.
	 *
	 * We check once here before acquiring locks to optimize the
	 * common case.
	 */
	if (sb->sb_tls_info != NULL)
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));
#endif

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

#ifdef KERN_TLS
	/* Re-check under the lock; release both locks before delegating. */
	if (sb->sb_tls_info != NULL) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));
	}
#endif

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	/* Remembered to detect whether anything was delivered so far. */
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are or were connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		error = ENOTCONN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		/* Queued data is delivered before the error is reported. */
		if (sbavail(sb) > 0)
			goto deliver;
		/* Something was already delivered: return success for now. */
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		/* A peek must not consume the pending error. */
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb) > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			if (*mp0 == NULL)
				*mp0 = sb->sb_mb;
			else
				m_cat(*mp0, sb->sb_mb);
			/*
			 * Walk whole mbufs that fit in 'len'; 'n' trails as
			 * the last mbuf handed over, 'm' ends on the first
			 * one that stays in the sockbuf (or NULL).
			 */
			for (m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				KASSERT(!(m->m_flags & M_NOTAVAIL),
				    ("%s: m %p not available", __func__, m));
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			/* Cut the handed-over chain from the remainder. */
			n->m_next = NULL;
			sb->sb_mb = m;
			sb->sb_lastrecord = sb->sb_mb;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= len;
			if (*mp0 != NULL)
				m_cat(*mp0, m);
			else
				*mp0 = m;
			/* Nothing dequeued and the copy failed. */
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
		     !(flags & MSG_SOCALLBCK))) {
			SOCKBUF_UNLOCK(sb);
			VNET_SO_ASSERT(so);
			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(sb);
		}
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	/* All paths arrive here holding both the mutex and the sblock. */
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

/*
 * Optimized version of soreceive() for simple datagram cases from userspace.
 * Unlike in the stream case, we're able to drop a datagram if copyout()
 * fails, and because we handle datagrams atomically, we don't need to use a
 * sleep lock to prevent I/O interlacing.
 *
 * Requests with mp0 != NULL, MSG_PEEK or MSG_OOB are handed to
 * soreceive_generic().  If the datagram is larger than the uio, the excess
 * is freed and MSG_TRUNC is reported through *flagsp.  Returns 0 or an
 * errno value (the pending so_error, EWOULDBLOCK, or an error from sbwait()
 * or uiomove()).
 */
int
soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *m2;
	int flags, error;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;

	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * For any complicated cases, fall back to the full
	 * soreceive_generic().
	 */
	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));

	/*
	 * Enforce restrictions on use.
	 */
	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
	    ("soreceive_dgram: wantrcvd"));
	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
	    ("soreceive_dgram: SBS_RCVATMARK"));
	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: P_CONNREQUIRED"));

	/*
	 * Loop blocking while waiting for a datagram.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	while ((m = so->so_rcv.sb_mb) == NULL) {
		KASSERT(sbavail(&so->so_rcv) == 0,
		    ("soreceive_dgram: sb_mb NULL but sbavail %u",
		    sbavail(&so->so_rcv)));
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
		    uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (0);
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (EWOULDBLOCK);
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (nextrecord == NULL) {
		KASSERT(so->so_rcv.sb_lastrecord == m,
		    ("soreceive_dgram: lastrecord != m"));
	}

	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
	    ("soreceive_dgram: m_nextpkt != nextrecord"));

	/*
	 * Pull 'm' and its chain off the front of the packet queue.
	 */
	so->so_rcv.sb_mb = NULL;
	sockbuf_pushsync(&so->so_rcv, nextrecord);

	/*
	 * Walk 'm's chain and free that many bytes from the socket buffer.
	 */
	for (m2 = m; m2 != NULL; m2 = m2->m_next)
		sbfree(&so->so_rcv, m2);

	/*
	 * Do a few last checks before we let go of the lock.
	 */
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		m = m_free(m);
	}
	if (m == NULL) {
		/* XXXRW: Can this happen? */
		return (0);
	}

	/*
	 * Packet to copyout() is now in 'm' and it is disconnected from the
	 * queue.
	 *
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  We call into the
	 * protocol to perform externalization (or freeing if controlp ==
	 * NULL).  In some cases there can be only MT_CONTROL mbufs without
	 * MT_DATA mbufs.
	 */
	if (m->m_type == MT_CONTROL) {
		/* 'cm' collects the control mbufs; 'cme' is its tail link. */
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			m2 = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = m2;
		} while (m != NULL && m->m_type == MT_CONTROL);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				/*
				 * NOTE(review): the result is stored in
				 * 'error' but not examined before it is
				 * overwritten or the function returns 0.
				 */
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp, flags);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				/* Advance to the end of what was just linked. */
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
	}
	KASSERT(m == NULL || m->m_type == MT_DATA,
	    ("soreceive_dgram: !data"));
	while (m != NULL && uio->uio_resid > 0) {
		len = uio->uio_resid;
		if (len > m->m_len)
			len = m->m_len;
		error = uiomove(mtod(m, char *), (int)len, uio);
		if (error) {
			/* The datagram is dropped on copy failure. */
			m_freem(m);
			return (error);
		}
		if (len == m->m_len)
			m = m_free(m);
		else {
			/* Partially consumed: trim the front of this mbuf. */
			m->m_data += len;
			m->m_len -= len;
		}
	}
	/* Data left over after the uio filled up: truncate and discard. */
	if (m != NULL) {
		flags |= MSG_TRUNC;
		m_freem(m);
	}
	if (flagsp != NULL)
		*flagsp |= flags;
	return (0);
}

/*
 * Receive entry point: dispatch to the protocol's pru_soreceive method with
 * the socket's vnet set as current.  Listening sockets are rejected with
 * ENOTCONN without calling the protocol.
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int error;

	CURVNET_SET(so->so_vnet);
	if (!SOLISTENING(so))
		error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
		    mp0, controlp, flagsp));
	else
		error = ENOTCONN;
	CURVNET_RESTORE();
	return (error);
}

/*
 * Shut down part or all of a socket.  'how' must be SHUT_RD, SHUT_WR or
 * SHUT_RDWR, otherwise EINVAL is returned.  Returns ENOTCONN for
 * unconnected sockets (see the comment below for the datagram/listening
 * exception), otherwise the result of the protocol's pru_shutdown when the
 * write side is involved, or 0.
 */
int
soshutdown(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;
	int error, soerror_enotconn;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	soerror_enotconn = 0;
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
		/*
		 * POSIX mandates us to return ENOTCONN when shutdown(2) is
		 * invoked on a datagram sockets, however historically we would
		 * actually tear socket down.  This is known to be leveraged by
		 * some applications to unblock process waiting in recvXXX(2)
		 * by other process that it shares that socket with.  Try to meet
		 * both backward-compatibility and POSIX requirements by forcing
		 * ENOTCONN but still asking protocol to perform pru_shutdown().
		 */
		if (so->so_type != SOCK_DGRAM && !SOLISTENING(so))
			return (ENOTCONN);
		soerror_enotconn = 1;
	}

	if (SOLISTENING(so)) {
		/* Listening socket: post ECONNABORTED and wake the listener. */
		if (how != SHUT_WR) {
			SOLISTEN_LOCK(so);
			so->so_error = ECONNABORTED;
			solisten_wakeup(so);	/* unlocks so */
		}
		goto done;
	}

	CURVNET_SET(so->so_vnet);
	if (pr->pr_usrreqs->pru_flush != NULL)
		(*pr->pr_usrreqs->pru_flush)(so, how);
	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD) {
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		wakeup(&so->so_timeo);
		CURVNET_RESTORE();
		/* A protocol error takes precedence over the forced ENOTCONN. */
		return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
	}
	wakeup(&so->so_timeo);
	CURVNET_RESTORE();

done:
	return (soerror_enotconn ? ENOTCONN : 0);
}

/*
 * Flush the receive side of a socket: mark it unable to receive more, move
 * the receive buffer contents into a temporary copy, and dispose of that
 * copy outside the socket buffer mutex.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct socket aso;

	VNET_SO_ASSERT(so);

	/*
	 * In order to avoid calling dom_dispose with the socket buffer mutex
	 * held, and in order to generally avoid holding the lock for a long
	 * time, we make a copy of the socket buffer and clear the original
	 * (except locks, state).  The new socket buffer copy won't have
	 * initialized locks so we can only call routines that won't use or
	 * assert those locks.
	 *
	 * Dislodge threads currently blocked in receive and wait to acquire
	 * a lock against other simultaneous readers before clearing the
	 * socket buffer.  Don't let our acquire be interrupted by a signal
	 * despite any existing socket disposition on interruptable waiting.
	 */
	socantrcvmore(so);
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);

	/*
	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
	 * and mutex data unchanged.
	 */
	SOCKBUF_LOCK(sb);
	bzero(&aso, sizeof(aso));
	aso.so_pcb = so->so_pcb;
	/* Copy, then zero, everything from sb_startzero to the end. */
	bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);

	/*
	 * Dispose of special rights and flush the copied socket.  Don't call
	 * any unsafe routines (that rely on locks being initialized) on aso.
	 */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(&aso);
	sbrelease_internal(&aso.so_rcv, so);
}

/*
 * Wrapper for Socket established helper hook.
 * Parameters: socket, context of the hook point, hook id.
 *
 * Runs the hooks registered under V_socket_hhh[h_id] with the socket's vnet
 * set as current, and returns the 'status' field of the hook data (it
 * starts out as 0).
 */
static int inline
hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
{
	struct socket_hhook_data hhook_data = {
		.so = so,
		.hctx = hctx,
		.m = NULL,
		.status = 0
	};

	CURVNET_SET(so->so_vnet);
	HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
	CURVNET_RESTORE();

	/* Ugly but needed, since hhooks return void for now */
	return (hhook_data.status);
}

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
 * additional variant to handle the case where the option value needs to be
 * some kind of integer, but not a specific size.  In addition to their use
 * here, these functions are also called by the protocol-level pr_ctloutput()
 * routines.
*/ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: 
case SO_NO_DDP: case SO_NO_OFFLOAD: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for any of these options, * so disallow them. 
*/ if (optval < 1) { error = EINVAL; goto bad; } error = sbsetopt(so, sopt->sopt_name, optval); break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = val; break; } break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. 
Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); optval = so->so_error; so->so_error = 0; SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = SOLISTENING(so) ? 
so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? 
M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = 
valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (so->so_rcv.sb_state & SBS_CANTRCVMORE) revents |= events & POLLRDHUP; if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCKBUF_UNLOCK(&so->so_rcv); 
SOCKBUF_UNLOCK(&so->so_snd); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCKBUF_LOCK(sb); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); } SOCK_UNLOCK(so); return (0); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. */ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) { return EOPNOTSUPP; } int pru_attach_notsupp(struct socket *so, int proto, struct thread *td) { return EOPNOTSUPP; } int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_disconnect_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) { return EOPNOTSUPP; } int 
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { if (control != NULL) m_freem(control); if ((flags & PRUS_NOTREADY) == 0) m_freem(m); return (EOPNOTSUPP); } int pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) { return (EOPNOTSUPP); } /* * This isn't really a ``null'' operation, but it's the default one and * doesn't do anything destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } int pru_shutdown_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { return EOPNOTSUPP; } int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { return EOPNOTSUPP; } int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td) { return EOPNOTSUPP; } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } 
SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven 
by protocol-side events.
 */

/*
 * Procedures to manipulate state flags of socket and do appropriate wakeups.
 *
 * Normal sequence from the active (originating) side is that
 * soisconnecting() is called during processing of connect() call, resulting
 * in an eventual call to soisconnected() if/when the connection is
 * established.  When the connection is torn down soisdisconnecting() is
 * called during processing of disconnect() call, and soisdisconnected() is
 * called when the connection to the peer is totally severed.  The semantics
 * of these routines are such that connectionless protocols can call
 * soisconnected() and soisdisconnected() only, bypassing the in-progress
 * calls when setting up a ``connection'' takes no time.
 *
 * From the passive side, a listening socket carries two queues of sockets:
 * sol_incomp for connections in progress and sol_comp for connections already
 * made and awaiting user acceptance.  As a protocol is preparing incoming
 * connections, it creates a socket structure queued on sol_incomp by calling
 * sonewconn().  When the connection is established, soisconnected() is
 * called, and transfers the socket structure to sol_comp, making it available
 * to accept().
 *
 * If a socket is closed with sockets on either sol_incomp or sol_comp, these
 * sockets are dropped.
 *
 * If higher-level protocols are implemented in the kernel, the wakeups done
 * here will sometimes cause software-interrupt process scheduling.
*/
/*
 * Mark a socket as having a connection attempt in progress: with the socket
 * lock held, clear SS_ISCONNECTED and SS_ISDISCONNECTING and set
 * SS_ISCONNECTING.  No sleepers are woken here.
 */
void
soisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
	SOCK_UNLOCK(so);
}

/*
 * Mark a socket as connected.
 *
 * The in-progress flags (SS_ISCONNECTING, SS_ISDISCONNECTING,
 * SS_ISCONFIRMING) are cleared and SS_ISCONNECTED is set.  What happens next
 * depends on so_qstate:
 *
 *  - SQ_INCOMP: the socket is still on its listener's sol_incomp queue.  It
 *    is either moved to sol_comp and the listener is woken, or, when
 *    SO_ACCEPTFILTER is set on the socket, the listener's accept filter is
 *    installed as the receive upcall and run once.  This path returns
 *    without issuing the wakeups below.
 *  - anything else: sleepers on so_timeo and the read/write sides of the
 *    socket are woken.
 */
void
soisconnected(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	if (so->so_qstate == SQ_INCOMP) {
		struct socket *head = so->so_listen;
		int ret;

		KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
		/*
		 * Promoting a socket from incomplete queue to complete, we
		 * need to go through reverse order of locking.  We first do
		 * trylock, and if that doesn't succeed, we go the hard way
		 * leaving a reference and rechecking consistency after proper
		 * locking.
		 */
		if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
			/* Pin the listener while our own lock is dropped. */
			soref(head);
			SOCK_UNLOCK(so);
			SOLISTEN_LOCK(head);
			SOCK_LOCK(so);
			if (__predict_false(head != so->so_listen)) {
				/*
				 * The socket went off the listen queue,
				 * should be lost race to close(2) of sol.
				 * The socket is about to soabort().
				 */
				SOCK_UNLOCK(so);
				/*
				 * NOTE(review): there is no explicit
				 * SOLISTEN_UNLOCK(head) on this path;
				 * presumably sorele() drops the lock together
				 * with the reference -- confirm.
				 */
				sorele(head);
				return;
			}
			/* Not the last one, as so holds a ref. */
			refcount_release(&head->so_count);
		}
again:
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			/* No filter pending: incomplete -> complete queue. */
			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
			head->sol_incqlen--;
			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
			head->sol_qlen++;
			so->so_qstate = SQ_COMP;
			SOCK_UNLOCK(so);
			solisten_wakeup(head);	/* unlocks */
		} else {
			/*
			 * Install the listener's accept filter as the receive
			 * upcall, clear SO_ACCEPTFILTER on this socket so the
			 * retry at "again" takes the branch above, and give
			 * the filter one immediate chance to accept.
			 */
			SOCKBUF_LOCK(&so->so_rcv);
			soupcall_set(so, SO_RCV,
			    head->sol_accept_filter->accf_callback,
			    head->sol_accept_filter_arg);
			so->so_options &= ~SO_ACCEPTFILTER;
			ret = head->sol_accept_filter->accf_callback(so,
			    head->sol_accept_filter_arg, M_NOWAIT);
			if (ret == SU_ISCONNECTED) {
				/* Filter is satisfied: drop it and promote. */
				soupcall_clear(so, SO_RCV);
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto again;
			}
			/* Filter stays installed; socket remains incomplete. */
			SOCKBUF_UNLOCK(&so->so_rcv);
			SOCK_UNLOCK(so);
			SOLISTEN_UNLOCK(head);
		}
		return;
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}

/*
 * Mark a socket as being in the process of disconnecting: clear
 * SS_ISCONNECTING, set SS_ISDISCONNECTING and, for a non-listening socket,
 * shut both directions via socantrcvmore_locked()/socantsendmore_locked().
 * Sleepers on so_timeo are woken afterwards.
 */
void
soisdisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	if (!SOLISTENING(so)) {
		/*
		 * NOTE(review): neither buffer lock is released in this
		 * function; presumably the *_locked() helpers drop the lock
		 * they are entered with -- confirm.
		 */
		SOCKBUF_LOCK(&so->so_rcv);
		socantrcvmore_locked(so);
		SOCKBUF_LOCK(&so->so_snd);
		socantsendmore_locked(so);
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}

/*
 * Mark a socket as fully disconnected.  For a non-listening socket the
 * receive side is shut, everything still queued in the send buffer is
 * dropped, and the send side is shut.  Sleepers on so_timeo are woken.
 */
void
soisdisconnected(struct socket *so)
{

	SOCK_LOCK(so);

	/*
	 * There is at least one reader of so_state that does not
	 * acquire socket lock, namely soreceive_generic().  Ensure
	 * that it never sees all flags that track connection status
	 * cleared, by ordering the update with a barrier semantic of
	 * our release thread fence.
	 */
	so->so_state |= SS_ISDISCONNECTED;
	atomic_thread_fence_rel();
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);

	if (!SOLISTENING(so)) {
		/* Unlike soisdisconnecting(), the socket lock is dropped first. */
		SOCK_UNLOCK(so);
		SOCKBUF_LOCK(&so->so_rcv);
		socantrcvmore_locked(so);
		SOCKBUF_LOCK(&so->so_snd);
		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
		socantsendmore_locked(so);
	} else
		SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}

/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
*/ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_rcv); } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_rcv); } static void so_rdknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_rcv); } else { if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); } } static void so_wrknl_lock(void *arg) { struct 
socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_snd); } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_snd); } static void so_wrknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_snd); } else { if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_snd); } } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? 
so->so_sigio->sio_pgid : 0; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); } } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), ("%s: val %d out of range", __func__, val)); so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 23fae343924a..18505b54d603 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -1,1632 +1,1632 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 
1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); static int sockargs(struct mbuf **, char *, socklen_t, int); /* * Convert a user file descriptor to a kernel file entry and check if required * capability rights are present. * If required copy of current set of capability rights is returned. * A reference on the file entry is held upon returning. */ int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, u_int *fflagp, struct filecaps *havecapsp) { struct file *fp; int error; error = fget_cap(td, fd, rightsp, &fp, havecapsp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); if (havecapsp != NULL) filecaps_free(havecapsp); return (ENOTSOCK); } if (fflagp != NULL) *fflagp = fp->f_flag; *fpp = fp; return (0); } /* * System call interface to the socket abstraction. 
*/ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int sys_socket(struct thread *td, struct socket_args *uap) { return (kern_socket(td, uap->domain, uap->type, uap->protocol)); } int kern_socket(struct thread *td, int domain, int type, int protocol) { struct socket *so; struct file *fp; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(domain, &so, type, protocol, td->td_ucred, td); if (error != 0) { fdclose(td, fp, fd); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } int sys_bind(struct thread *td, struct bind_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) return (ECAPMODE); #endif AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, &cap_bind_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, 
so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } int sys_bindat(struct thread *td, struct bindat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int sys_listen(struct thread *td, struct listen_args *uap) { return (kern_listen(td, uap->s, uap->backlog)); } int kern_listen(struct thread *td, int s, int backlog) { struct socket *so; struct file *fp; int error; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_listen_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, backlog, td); fdrop(fp, td); } return (error); } /* * accept1() */ static int accept1(td, s, uname, anamelen, flags) struct thread *td; int s; struct sockaddr *uname; socklen_t *anamelen; int flags; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uname == NULL) return (kern_accept4(td, s, NULL, NULL, flags, NULL)); error = copyin(anamelen, &namelen, sizeof (namelen)); if (error != 0) return (error); error = kern_accept4(td, s, &name, &namelen, flags, &fp); if (error != 0) return (error); if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && (flags & ACCEPT4_COMPAT) != 0) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uname, namelen); } if (error == 0) error = copyout(&namelen, anamelen, sizeof(namelen)); if (error != 0) fdclose(td, fp, td->td_retval[0]); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { struct file *headfp, 
*nfp = NULL; struct sockaddr *sa = NULL; struct socket *head, *so; struct filecaps fcaps; u_int fflag; pid_t pgid; int error, fd, tmp; if (name != NULL) *name = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_accept_rights, &headfp, &fflag, &fcaps); if (error != 0) return (error); head = headfp->f_data; - if ((head->so_options & SO_ACCEPTCONN) == 0) { + if (!SOLISTENING(head)) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif error = falloc_caps(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; SOCK_LOCK(head); if (!SOLISTENING(head)) { SOCK_UNLOCK(head); error = EINVAL; goto noconnection; } error = solisten_dequeue(head, &so, flags); if (error != 0) goto noconnection; /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* Connection has been removed from the listen queue. */ KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); error = soaccept(so, &sa); if (error != 0) goto noconnection; if (sa == NULL) { if (name) *namelen = 0; goto done; } AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif *name = sa; sa = NULL; } noconnection: free(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. 
*/ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (nfp == NULL) filecaps_free(&fcaps); if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(td, uap) struct thread *td; struct accept4_args *uap; { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(struct thread *td, struct oaccept_args *uap) { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ int sys_connect(struct thread *td, struct connect_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; int error, interrupted = 0; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) return (ECAPMODE); #endif AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, &cap_connect_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif if (dirfd == AT_FDCWD) error = soconnect(so, sa, td); else error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && 
(so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } int sys_connectat(struct thread *td, struct connectat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. 
*/ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error != 0) goto free4; } else if (so1->so_proto->pr_flags & PR_CONNREQUIRED) { struct unpcb *unp, *unp2; unp = sotounpcb(so1); unp2 = sotounpcb(so2); /* * No need to lock the unps, because the sockets are brand-new. * No other threads can be using them yet */ unp_copy_peercred(td, unp, unp2, unp); } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(td, fp2, rsv[1]); fdrop(fp2, td); free3: fdclose(td, fp1, rsv[0]); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(struct thread *td, int s, struct msghdr *mp, int flags) { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else 
{ to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && (mp->msg_flags != MSG_COMPAT || !SV_PROC_FLAG(td->td_proc, SV_AOUT)) #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT && SV_PROC_FLAG(td->td_proc, SV_AOUT)) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: free(to, M_SONAME); return (error); } int kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t *rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); rights = &cap_send_rights; if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); rights = &cap_send_connect_rights; } error = getsock_cap(td, s, rights, &fp, NULL, NULL); if (error != 0) { m_freem(control); return (error); } so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) { m_freem(control); goto bad; } } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) { m_freem(control); goto bad; } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; 
m_freem(control); goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sys_sendto(struct thread *td, struct sendto_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = __DECONST(void *, uap->to); msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags = 0; #endif aiov.iov_base = __DECONST(void *, uap->buf); aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(struct thread *td, struct osend_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = __DECONST(void *, uap->buf); aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(struct thread *td, struct osendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(struct 
thread *td, struct sendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, struct mbuf **controlp) { struct uio auio; struct iovec *iov; struct mbuf *control, *m; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_recv_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif control = NULL; len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? 
&control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if ((mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. 
*/ if (control && (mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif ctlbuf = mp->msg_control; len = mp->msg_controllen; mp->msg_controllen = 0; for (m = control; m != NULL && len >= m->m_len; m = m->m_next) { if ((error = copyout(mtod(m, caddr_t), ctlbuf, m->m_len)) != 0) goto out; ctlbuf += m->m_len; len -= m->m_len; mp->msg_controllen += m->m_len; } if (m != NULL) { mp->msg_flags |= MSG_CTRUNC; m_dispose_extcontrolm(m); } } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control != NULL) { if (error != 0) m_dispose_extcontrolm(control); m_freem(control); } return (error); } static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp) { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if ((mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int sys_recvfrom(struct thread *td, struct recvfrom_args *uap) { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return (error); } #ifdef COMPAT_OLDSOCK int 
orecvfrom(struct thread *td, struct recvfrom_args *uap) { uap->flags |= MSG_COMPAT; return (sys_recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(struct thread *td, struct orecv_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(struct thread *td, struct orecvmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int sys_recvmsg(struct thread *td, struct recvmsg_args *uap) { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } int sys_shutdown(struct thread *td, struct shutdown_args *uap) { return (kern_shutdown(td, uap->s, uap->how)); } int kern_shutdown(struct thread 
*td, int s, int how) { struct socket *so; struct file *fp; int error; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_shutdown_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, how); /* * Previous versions did not return ENOTCONN, but 0 in * case the socket was not connected. Some important * programs like syslogd up to r279016, 2015-02-19, * still depend on this behavior. */ if (error == ENOTCONN && td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) error = 0; fdrop(fp, td); } return (error); } /* * Passes the fields of uap unchanged to kern_setsockopt(), tagging the * option value as a userspace pointer (UIO_USERSPACE). */ int sys_setsockopt(struct thread *td, struct setsockopt_args *uap) { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } /* * Set option (level, name) on the socket behind descriptor s. * val may be a userspace or kernel pointer, as selected by valseg: * sopt_td is set to td for UIO_USERSPACE and to NULL for UIO_SYSSPACE; * any other valseg panics. Returns EFAULT for a NULL val with a nonzero * valsize, EINVAL when valsize is negative when viewed as an int, and * otherwise the result of getsock_cap() or sosetopt(). */ int kern_setsockopt(struct thread *td, int s, int level, int name, const void *val, enum uio_seg valseg, socklen_t valsize) { struct socket *so; struct file *fp; struct sockopt sopt; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = __DECONST(void *, val); sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_setsockopt_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } /* * Reads the caller's length from uap->avalsize only when uap->val is * non-NULL, calls kern_getsockopt() with the value tagged UIO_USERSPACE, * and on success copies the resulting length back to uap->avalsize. */ int sys_getsockopt(struct thread *td, struct getsockopt_args *uap) { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * val (optval) can be a kernel or userspace pointer, as selected by valseg. * valsize (optlen) is always a kernel pointer.
*/ int kern_getsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t *valsize) { struct socket *so; struct file *fp; struct sockopt sopt; int error; /* With no value buffer the caller's length is ignored and treated as 0. */ if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ /* sopt_td is td for a userspace val and NULL for a kernel val. */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_getsockopt_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); /* Hand back the length left in sopt by sogetopt(), even on error. */ *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name. * * Reads the caller's buffer length from uap->alen, obtains the address * with kern_getsockname(), copies the (already clamped) length of bytes * out to uap->asa, frees the address and, if nothing failed, writes the * resulting length back to uap->alen. Under COMPAT_OLDSOCK, when compat * is nonzero and the process has SV_AOUT set, sa_family is first stored * through the osockaddr layout. */ static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } /* * Look up the socket behind fd and obtain its address from the protocol's * pru_sockaddr method, returned through *sa. *alen is clamped to the * address's sa_len, or set to 0 when no address is returned. */ int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, &cap_getsockname_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(*sa); #endif bad: fdrop(fp, td); if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int sys_getsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, &cap_getpeername_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int sys_getpeername(struct thread *td, struct getpeername_args *uap) { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(struct thread *td, 
struct ogetpeername_args *uap) { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * Copy buflen bytes from buf with copyin() into a newly allocated mbuf of * the given type and return it through *mp. A buflen above MLEN must * still be no larger than MCLBYTES or EINVAL is returned (the * COMPAT_OLDSOCK branch instead truncates short MT_SONAME requests from * SV_AOUT processes to MLEN). For MT_SONAME the copied sockaddr's sa_len * is overwritten with buflen. If copyin() fails the mbuf is freed and * *mp is not written. */ static int sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112 && SV_CURPROC_FLAG(SV_AOUT)) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EINVAL); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, void *), buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX && SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } /* * Allocate an M_SONAME buffer of len bytes, copyin() the address at uaddr * into it, force its sa_len to len and return it through *namp; the * caller releases it with free(..., M_SONAME). Returns ENAMETOOLONG when * len exceeds SOCK_MAXADDRLEN, EINVAL when len is smaller than the offset * of sa_data, or the copyin() error, in which case the buffer is freed * here and *namp is not written. */ int getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr, size_t len) { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX && SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } /* * Dispose of externalized rights from an SCM_RIGHTS message. This function * should be used in error or truncation cases to avoid leaking file descriptors * into the recipient's (the current thread's) table.
*/ void m_dispose_extcontrolm(struct mbuf *m) { struct cmsghdr *cm; struct file *fp; struct thread *td; socklen_t clen, datalen; int error, fd, *fds, nfd; td = curthread; for (; m != NULL; m = m->m_next) { if (m->m_type != MT_EXTCONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (clen > 0) { if (clen < sizeof(*cm)) panic("%s: truncated mbuf %p", __func__, m); datalen = CMSG_SPACE(cm->cmsg_len - CMSG_SPACE(0)); if (clen < datalen) panic("%s: truncated mbuf %p", __func__, m); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { fds = (int *)CMSG_DATA(cm); nfd = (cm->cmsg_len - CMSG_SPACE(0)) / sizeof(int); while (nfd-- > 0) { fd = *fds++; error = fget(td, fd, &cap_no_rights, &fp); if (error == 0) { fdclose(td, fp, fd); fdrop(fp, td); } } } clen -= datalen; cm = (struct cmsghdr *)((uint8_t *)cm + datalen); } m_chtype(m, MT_CONTROL); } } diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 3f7198c2f3ae..eada98b48a1e 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -1,2996 +1,2996 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All Rights Reserved. * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved. * Copyright (c) 2018 Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 */ /* * UNIX Domain (Local) Sockets * * This is an implementation of UNIX (local) domain sockets. Each socket has * an associated struct unpcb (UNIX protocol control block). Stream sockets * may be connected to 0 or 1 other socket. Datagram sockets may be * connected to 0, 1, or many other sockets. Sockets may be created and * connected in pairs (socketpair(2)), or bound/connected to using the file * system name space. For most purposes, only the receive socket buffer is * used, as sending on one socket delivers directly to the receive socket * buffer of a second socket. * * The implementation is substantially complicated by the fact that * "ancillary data", such as file descriptors or credentials, may be passed * across UNIX domain sockets. The potential for passing UNIX domain sockets * over other UNIX domain sockets requires the implementation of a simple * garbage collector to find and tear down cycles of disconnected sockets. 
* * TODO: * RDM * rethink name space problems * need a proper out-of-band */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include MALLOC_DECLARE(M_FILECAPS); /* * See unpcb.h for the locking key. */ static uma_zone_t unp_zone; static unp_gen_t unp_gencnt; /* (l) */ static u_int unp_count; /* (l) Count of local sockets. */ static ino_t unp_ino; /* Prototype for fake inode numbers. */ static int unp_rights; /* (g) File descriptors in flight. */ static struct unp_head unp_shead; /* (l) List of stream sockets. */ static struct unp_head unp_dhead; /* (l) List of datagram sockets. */ static struct unp_head unp_sphead; /* (l) List of seqpacket sockets. */ struct unp_defer { SLIST_ENTRY(unp_defer) ud_link; struct file *ud_fp; }; static SLIST_HEAD(, unp_defer) unp_defers; static int unp_defers_count; static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; /* * Garbage collection of cyclic file descriptor/socket references occurs * asynchronously in a taskqueue context in order to avoid recursion and * reentrance in the UNIX domain socket, file descriptor, and socket layer * code. See unp_gc() for a full description. */ static struct timeout_task unp_gc_task; /* * The close of unix domain sockets attached as SCM_RIGHTS is * postponed to the taskqueue, to avoid arbitrary recursion depth. * The attached sockets might themselves have other sockets attached. */ static struct task unp_defer_task; /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering for * stream sockets, although the total for sender and receiver is actually * only PIPSIZ.
* * Datagram sockets really use the sendspace as the maximum datagram size, * and don't really want to reserve the sendspace. Their recvspace should be * large enough for at least one max-size datagram plus address. */ #ifndef PIPSIZ #define PIPSIZ 8192 #endif static u_long unpst_sendspace = PIPSIZ; static u_long unpst_recvspace = PIPSIZ; static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ static u_long unpdg_recvspace = 4*1024; static u_long unpsp_sendspace = PIPSIZ; /* really max datagram size */ static u_long unpsp_recvspace = PIPSIZ; static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Local domain"); static SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SOCK_STREAM"); static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SOCK_DGRAM"); static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SOCK_SEQPACKET"); SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, &unpst_sendspace, 0, "Default stream send space."); SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, &unpst_recvspace, 0, "Default stream receive space."); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, &unpdg_sendspace, 0, "Default datagram send space."); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, &unpdg_recvspace, 0, "Default datagram receive space."); SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW, &unpsp_sendspace, 0, "Default seqpacket send space."); SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW, &unpsp_recvspace, 0, "Default seqpacket receive space."); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "File descriptors in flight."); SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD, &unp_defers_count, 0, "File descriptors deferred to taskqueue for close."); /* * Locking and synchronization: * * Several types of locks exist in the 
local domain socket implementation: * - a global linkage lock * - a global connection list lock * - the mtxpool lock * - per-unpcb mutexes * * The linkage lock protects the global socket lists, the generation number * counter and garbage collector state. * * The connection list lock protects the list of referring sockets in a datagram * socket PCB. This lock is also overloaded to protect a global list of * sockets whose buffers contain socket references in the form of SCM_RIGHTS * messages. To avoid recursion, such references are released by a dedicated * thread. * * The mtxpool lock protects the vnode from being modified while referenced. * Lock ordering rules require that it be acquired before any PCB locks. * * The unpcb lock (unp_mtx) protects the most commonly referenced fields in the * unpcb. This includes the unp_conn field, which either links two connected * PCBs together (for connected socket types) or points at the destination * socket (for connectionless socket types). The operations of creating or * destroying a connection therefore involve locking multiple PCBs. To avoid * lock order reversals, in some cases this involves dropping a PCB lock and * using a reference counter to maintain liveness. * * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer, * allocated in pru_attach() and freed in pru_detach(). The validity of that * pointer is an invariant, so no lock is required to dereference the so_pcb * pointer if a valid socket reference is held by the caller. In practice, * this is always true during operations performed on a socket. Each unpcb * has a back-pointer to its socket, unp_socket, which will be stable under * the same circumstances. * * This pointer may only be safely dereferenced as long as a valid reference * to the unpcb is held. Typically, this reference will be from the socket, * or from another unpcb when the referring unpcb's lock is held (in order * that the reference not be invalidated during use). 
For example, to follow * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee * that detach is not run clearing unp_socket. * * Blocking with UNIX domain sockets is a tricky issue: unlike most network * protocols, bind() is a non-atomic operation, and connect() requires * potential sleeping in the protocol, due to potentially waiting on local or * distributed file systems. We try to separate "lookup" operations, which * may sleep, and the IPC operations themselves, which typically can occur * with relative atomicity as locks can be held over the entire operation. * * Another tricky issue is simultaneous multi-threaded or multi-process * access to a single UNIX domain socket. These are handled by the flags * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or * binding, both of which involve dropping UNIX domain socket locks in order * to perform namei() and other file system operations. */ static struct rwlock unp_link_rwlock; static struct mtx unp_defers_lock; #define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \ "unp_link_rwlock") #define UNP_LINK_LOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_LOCKED) #define UNP_LINK_UNLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_UNLOCKED) #define UNP_LINK_RLOCK() rw_rlock(&unp_link_rwlock) #define UNP_LINK_RUNLOCK() rw_runlock(&unp_link_rwlock) #define UNP_LINK_WLOCK() rw_wlock(&unp_link_rwlock) #define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock) #define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_WLOCKED) #define UNP_LINK_WOWNED() rw_wowned(&unp_link_rwlock) #define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \ "unp_defer", NULL, MTX_DEF) #define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock) #define UNP_DEFERRED_UNLOCK() mtx_unlock(&unp_defers_lock) #define UNP_REF_LIST_LOCK() UNP_DEFERRED_LOCK(); #define UNP_REF_LIST_UNLOCK() UNP_DEFERRED_UNLOCK(); #define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \ "unp", "unp", \ MTX_DUPOK|MTX_DEF) #define 
UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx) #define UNP_PCB_LOCKPTR(unp) (&(unp)->unp_mtx) #define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx) #define UNP_PCB_TRYLOCK(unp) mtx_trylock(&(unp)->unp_mtx) #define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx) #define UNP_PCB_OWNED(unp) mtx_owned(&(unp)->unp_mtx) #define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED) #define UNP_PCB_UNLOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED) static int uipc_connect2(struct socket *, struct socket *); static int uipc_ctloutput(struct socket *, struct sockopt *); static int unp_connect(struct socket *, struct sockaddr *, struct thread *); static int unp_connectat(int, struct socket *, struct sockaddr *, struct thread *); static int unp_connect2(struct socket *so, struct socket *so2, int); static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2); static void unp_dispose(struct socket *so); static void unp_dispose_mbuf(struct mbuf *); static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *); static void unp_gc(__unused void *, int); static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int)); static void unp_discard(struct file *); static void unp_freerights(struct filedescent **, int); static void unp_init(void); static int unp_internalize(struct mbuf **, struct thread *); static void unp_internalize_fp(struct file *); static int unp_externalize(struct mbuf *, struct mbuf **, int); static int unp_externalize_fp(struct file *); static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *, int); static void unp_process_defers(void * __unused, int); static void unp_pcb_hold(struct unpcb *unp) { u_int old __unused; old = refcount_acquire(&unp->unp_refcount); KASSERT(old > 0, ("%s: unpcb %p has no references", __func__, unp)); } static __result_use_check bool unp_pcb_rele(struct unpcb *unp) { bool ret; UNP_PCB_LOCK_ASSERT(unp); if ((ret = refcount_release(&unp->unp_refcount))) { UNP_PCB_UNLOCK(unp); 
UNP_PCB_LOCK_DESTROY(unp);
		uma_zfree(unp_zone, unp);
	}
	return (ret);
}

/*
 * Drop a reference that the caller knows is not the last one.  Nothing is
 * freed here; if refcount_release() reports that this was the final
 * reference the KASSERT fires.
 */
static void
unp_pcb_rele_notlast(struct unpcb *unp)
{
	bool ret __unused;

	ret = refcount_release(&unp->unp_refcount);
	KASSERT(!ret, ("%s: unpcb %p has no references", __func__, unp));
}

/*
 * Lock two PCBs, neither of which may already be held by the caller.  When
 * the PCBs are distinct the one at the lower address is locked first, giving
 * every caller the same acquisition order.  unp == unp2 is allowed and takes
 * the single lock once.
 */
static void
unp_pcb_lock_pair(struct unpcb *unp, struct unpcb *unp2)
{
	UNP_PCB_UNLOCK_ASSERT(unp);
	UNP_PCB_UNLOCK_ASSERT(unp2);

	if (unp == unp2) {
		UNP_PCB_LOCK(unp);
	} else if ((uintptr_t)unp2 > (uintptr_t)unp) {
		UNP_PCB_LOCK(unp);
		UNP_PCB_LOCK(unp2);
	} else {
		UNP_PCB_LOCK(unp2);
		UNP_PCB_LOCK(unp);
	}
}

/*
 * Release the locks taken by unp_pcb_lock_pair() (or an equivalent pair of
 * acquisitions).  When both arguments name the same PCB it is unlocked once.
 */
static void
unp_pcb_unlock_pair(struct unpcb *unp, struct unpcb *unp2)
{
	UNP_PCB_UNLOCK(unp);
	if (unp != unp2)
		UNP_PCB_UNLOCK(unp2);
}

/*
 * Try to lock the connected peer of an already locked socket. In some cases
 * this requires that we unlock the current socket. The pairbusy counter is
 * used to block concurrent connection attempts while the lock is dropped. The
 * caller must be careful to revalidate PCB state.
 *
 * Returns the locked peer, unp itself when the socket is connected to
 * itself, or NULL when there is no peer (or it went away while the lock was
 * dropped).  On every return path unp is still locked.
 */
static struct unpcb *
unp_pcb_lock_peer(struct unpcb *unp)
{
	struct unpcb *unp2;

	UNP_PCB_LOCK_ASSERT(unp);
	unp2 = unp->unp_conn;
	if (unp2 == NULL)
		return (NULL);
	/* Self-connected: the one lock we hold already covers the "peer". */
	if (__predict_false(unp == unp2))
		return (unp);

	UNP_PCB_UNLOCK_ASSERT(unp2);

	/* Fast path: the peer lock is free, no ordering concern. */
	if (__predict_true(UNP_PCB_TRYLOCK(unp2)))
		return (unp2);
	/* Peer sorts after us, so blocking on it respects address order. */
	if ((uintptr_t)unp2 > (uintptr_t)unp) {
		UNP_PCB_LOCK(unp2);
		return (unp2);
	}
	/*
	 * Peer sorts before us: drop our lock and retake both in order.  The
	 * pairbusy count and the extra reference on unp2 cover the window in
	 * which unp is unlocked.
	 */
	unp->unp_pairbusy++;
	unp_pcb_hold(unp2);
	UNP_PCB_UNLOCK(unp);

	UNP_PCB_LOCK(unp2);
	UNP_PCB_LOCK(unp);
	KASSERT(unp->unp_conn == unp2 || unp->unp_conn == NULL,
	    ("%s: socket %p was reconnected", __func__, unp));
	/* Last thread out wakes a sleeper that set UNP_WAITING on unp. */
	if (--unp->unp_pairbusy == 0 && (unp->unp_flags & UNP_WAITING) != 0) {
		unp->unp_flags &= ~UNP_WAITING;
		wakeup(unp);
	}
	if (unp_pcb_rele(unp2)) {
		/* unp2 is unlocked. */
		return (NULL);
	}
	/* Disconnected while unlocked: the peer is no longer ours to return. */
	if (unp->unp_conn == NULL) {
		UNP_PCB_UNLOCK(unp2);
		return (NULL);
	}
	return (unp2);
}

/*
 * Definitions of protocols supported in the LOCAL domain.
*/
static struct domain localdomain;
static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream;
static struct pr_usrreqs uipc_usrreqs_seqpacket;
/* One protocol switch entry per supported socket type. */
static struct protosw localsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&localdomain,
	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS|
				    PR_CAPATTACH,
	.pr_ctloutput =		&uipc_ctloutput,
	.pr_usrreqs =		&uipc_usrreqs_stream
},
{
	.pr_type =		SOCK_DGRAM,
	.pr_domain =		&localdomain,
	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS|PR_CAPATTACH,
	.pr_ctloutput =		&uipc_ctloutput,
	.pr_usrreqs =		&uipc_usrreqs_dgram
},
{
	.pr_type =		SOCK_SEQPACKET,
	.pr_domain =		&localdomain,

	/*
	 * XXXRW: For now, PR_ADDR because soreceive will bump into them
	 * due to our use of sbappendaddr. A new sbappend variant is needed
	 * that supports both atomic record writes and control data.
	 */
	.pr_flags =		PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|
				    PR_WANTRCVD|PR_RIGHTS|PR_CAPATTACH,
	.pr_ctloutput =		&uipc_ctloutput,
	.pr_usrreqs =		&uipc_usrreqs_seqpacket,
},
};

/* The AF_LOCAL domain descriptor, tying the entries above to unp_* hooks. */
static struct domain localdomain = {
	.dom_family =		AF_LOCAL,
	.dom_name =		"local",
	.dom_init =		unp_init,
	.dom_externalize =	unp_externalize,
	.dom_dispose =		unp_dispose,
	.dom_protosw =		localsw,
	.dom_protoswNPROTOSW =	&localsw[nitems(localsw)]
};
DOMAIN_SET(local);

/*
 * pru_abort handler.  If the socket has a connected peer, take a reference
 * on the peer, drop our own PCB lock and hand the peer to unp_drop();
 * otherwise there is nothing to do beyond unlocking.
 * NOTE(review): unp_drop() is defined outside this view; presumably it
 * consumes the reference taken here -- confirm.
 */
static void
uipc_abort(struct socket *so)
{
	struct unpcb *unp, *unp2;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
	UNP_PCB_UNLOCK_ASSERT(unp);

	UNP_PCB_LOCK(unp);
	unp2 = unp->unp_conn;
	if (unp2 != NULL) {
		/* Keep the peer alive across the unlock below. */
		unp_pcb_hold(unp2);
		UNP_PCB_UNLOCK(unp);
		unp_drop(unp2);
	} else
		UNP_PCB_UNLOCK(unp);
}

static int
uipc_accept(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp, *unp2;
	const struct sockaddr *sa;

	/*
	 * Pass back name of connected socket, if it was bound and we are
	 * still connected (our peer may have closed already!).
*/
	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));

	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	UNP_PCB_LOCK(unp);
	unp2 = unp_pcb_lock_peer(unp);
	if (unp2 != NULL && unp2->unp_addr != NULL)
		sa = (struct sockaddr *)unp2->unp_addr;
	else
		sa = &sun_noname;
	bcopy(sa, *nam, sa->sa_len);
	if (unp2 != NULL)
		unp_pcb_unlock_pair(unp, unp2);
	else
		UNP_PCB_UNLOCK(unp);
	return (0);
}

/*
 * pru_attach handler: create the unpcb for a new socket.
 *
 * If either socket buffer has no high-water mark yet, buffer space is
 * reserved with the per-type defaults (stream, dgram or seqpacket).  A
 * zeroed PCB is then allocated without sleeping, initialised with one
 * reference, cross-linked with the socket and inserted on the global list
 * for its type under the link write lock.
 *
 * Returns 0 on success, the soreserve() error if reservation fails, or
 * ENOBUFS if the PCB cannot be allocated.  Panics on an unknown so_type.
 * The proto and td arguments are not used here.
 */
static int
uipc_attach(struct socket *so, int proto, struct thread *td)
{
	u_long sendspace, recvspace;
	struct unpcb *unp;
	int error;
	bool locked;

	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		switch (so->so_type) {
		case SOCK_STREAM:
			sendspace = unpst_sendspace;
			recvspace = unpst_recvspace;
			break;

		case SOCK_DGRAM:
			sendspace = unpdg_sendspace;
			recvspace = unpdg_recvspace;
			break;

		case SOCK_SEQPACKET:
			sendspace = unpsp_sendspace;
			recvspace = unpsp_recvspace;
			break;

		default:
			panic("uipc_attach");
		}
		error = soreserve(so, sendspace, recvspace);
		if (error)
			return (error);
	}
	unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
	if (unp == NULL)
		return (ENOBUFS);
	LIST_INIT(&unp->unp_refs);
	UNP_PCB_LOCK_INIT(unp);
	unp->unp_socket = so;
	so->so_pcb = unp;
	refcount_init(&unp->unp_refcount, 1);

	/*
	 * The caller may already own the link lock for writing; only take
	 * (and later drop) it when that is not the case.
	 */
	if ((locked = UNP_LINK_WOWNED()) == false)
		UNP_LINK_WLOCK();

	unp->unp_gencnt = ++unp_gencnt;
	unp->unp_ino = ++unp_ino;
	unp_count++;
	switch (so->so_type) {
	case SOCK_STREAM:
		LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
		break;

	case SOCK_DGRAM:
		LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
		break;

	case SOCK_SEQPACKET:
		LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
		break;

	default:
		panic("uipc_attach");
	}

	if (locked == false)
		UNP_LINK_WUNLOCK();

	return (0);
}

static int
uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vattr vattr;
	int error, namelen;
	struct nameidata nd;
	struct unpcb *unp;
	struct vnode *vp;
	struct mount *mp;
cap_rights_t rights; char *buf; if (nam->sa_family != AF_UNIX) return (EAFNOSUPPORT); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); if (soun->sun_len > sizeof(struct sockaddr_un)) return (EINVAL); namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); if (namelen <= 0) return (EINVAL); /* * We don't allow simultaneous bind() calls on a single UNIX domain * socket, so flag in-progress operations, and return an error if an * operation is already in progress. * * Historically, we have not allowed a socket to be rebound, so this * also returns an error. Not allowing re-binding simplifies the * implementation and avoids a great many possible failure modes. */ UNP_PCB_LOCK(unp); if (unp->unp_vnode != NULL) { UNP_PCB_UNLOCK(unp); return (EINVAL); } if (unp->unp_flags & UNP_BINDING) { UNP_PCB_UNLOCK(unp); return (EALREADY); } unp->unp_flags |= UNP_BINDING; UNP_PCB_UNLOCK(unp); buf = malloc(namelen + 1, M_TEMP, M_WAITOK); bcopy(soun->sun_path, buf, namelen); buf[namelen] = 0; restart: NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE, UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_BINDAT), td); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); if (error) goto error; vp = nd.ni_vp; if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (vp != NULL) { vrele(vp); error = EADDRINUSE; goto error; } error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error) goto error; goto restart; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_pd->pd_cmask); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); NDFREE(&nd, NDF_ONLY_PNBUF); if (error) { VOP_VPUT_PAIR(nd.ni_dvp, NULL, true); vn_finished_write(mp); if (error == 
ERELOOKUP)
			goto restart;
		goto error;
	}
	vp = nd.ni_vp;
	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);

	UNP_PCB_LOCK(unp);
	VOP_UNP_BIND(vp, unp);
	unp->unp_vnode = vp;
	unp->unp_addr = soun;
	unp->unp_flags &= ~UNP_BINDING;
	UNP_PCB_UNLOCK(unp);
	vref(vp);
	VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
	vn_finished_write(mp);
	free(buf, M_TEMP);
	return (0);

error:
	UNP_PCB_LOCK(unp);
	unp->unp_flags &= ~UNP_BINDING;
	UNP_PCB_UNLOCK(unp);
	free(buf, M_TEMP);
	return (error);
}

/* pru_bind handler: bind relative to the current working directory. */
static int
uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (uipc_bindat(AT_FDCWD, so, nam, td));
}

/*
 * pru_connect handler: thin wrapper around unp_connect().  Must be called
 * with td being the current thread.
 */
static int
uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
	error = unp_connect(so, nam, td);
	return (error);
}

/*
 * pru_connectat handler: as uipc_connect(), but the path lookup starts at
 * the directory descriptor fd, which is passed through to unp_connectat().
 */
static int
uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{
	int error;

	KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
	error = unp_connectat(fd, so, nam, td);
	return (error);
}

/*
 * pru_close handler: detach the socket from its bound vnode, if any, and
 * disconnect it from its peer, if any.
 */
static void
uipc_close(struct socket *so)
{
	struct unpcb *unp, *unp2;
	struct vnode *vp = NULL;
	struct mtx *vplock;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));

	vplock = NULL;
	/* The vnode's pool mutex is taken before the PCB lock. */
	if ((vp = unp->unp_vnode) != NULL) {
		vplock = mtx_pool_find(mtxpool_sleep, vp);
		mtx_lock(vplock);
	}
	UNP_PCB_LOCK(unp);
	/* unp_vnode was sampled unlocked; it may have been cleared since. */
	if (vp && unp->unp_vnode == NULL) {
		mtx_unlock(vplock);
		vp = NULL;
	}
	if (vp != NULL) {
		VOP_UNP_DETACH(vp);
		unp->unp_vnode = NULL;
	}
	/*
	 * NOTE(review): unp_disconnect() is cut off in this view; callers
	 * here never unlock after it, so it presumably drops both PCB locks.
	 */
	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
		unp_disconnect(unp, unp2);
	else
		UNP_PCB_UNLOCK(unp);
	/* Release the vnode only after the PCB lock is no longer needed. */
	if (vp) {
		mtx_unlock(vplock);
		vrele(vp);
	}
}

static int
uipc_connect2(struct socket *so1, struct socket *so2)
{
	struct unpcb *unp, *unp2;
	int error;

	unp = so1->so_pcb;
	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
	unp2 = so2->so_pcb;
	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
	unp_pcb_lock_pair(unp, unp2);
	error = unp_connect2(so1, so2, PRU_CONNECT2);
unp_pcb_unlock_pair(unp, unp2);
	return (error);
}

/*
 * pru_detach handler: tear down the unpcb of a socket that is going away.
 *
 * In order: release the receive buffer (non-listening sockets only), unlink
 * the PCB from the global lists, detach it from its bound vnode, disconnect
 * it from its peer, drop every PCB on its unp_refs list, sever the
 * socket <-> PCB pointers, free the bound address and release the socket's
 * reference on the PCB.  If unp_rights was non-zero when sampled, the
 * garbage-collection task is queued at the end.
 */
static void
uipc_detach(struct socket *so)
{
	struct unpcb *unp, *unp2;
	struct mtx *vplock;
	struct vnode *vp;
	int local_unp_rights;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));

	vp = NULL;
	vplock = NULL;

	SOCK_LOCK(so);
	if (!SOLISTENING(so)) {
		/*
		 * Once the socket is removed from the global lists,
		 * uipc_ready() will not be able to locate its socket buffer, so
		 * clear the buffer now.  At this point internalized rights have
		 * already been disposed of.
		 */
		sbrelease(&so->so_rcv, so);
	}
	SOCK_UNLOCK(so);

	UNP_LINK_WLOCK();
	LIST_REMOVE(unp, unp_link);
	if (unp->unp_gcflag & UNPGC_DEAD)
		LIST_REMOVE(unp, unp_dead);
	unp->unp_gencnt = ++unp_gencnt;
	--unp_count;
	UNP_LINK_WUNLOCK();

	UNP_PCB_UNLOCK_ASSERT(unp);
 restart:
	/*
	 * The vnode's pool mutex is taken before the PCB lock, so unp_vnode
	 * is sampled unlocked here and re-checked once the PCB is locked.
	 */
	if ((vp = unp->unp_vnode) != NULL) {
		vplock = mtx_pool_find(mtxpool_sleep, vp);
		mtx_lock(vplock);
	}
	UNP_PCB_LOCK(unp);
	/* A different vnode appeared meanwhile: drop everything and retry. */
	if (unp->unp_vnode != vp && unp->unp_vnode != NULL) {
		if (vplock)
			mtx_unlock(vplock);
		UNP_PCB_UNLOCK(unp);
		goto restart;
	}
	if ((vp = unp->unp_vnode) != NULL) {
		VOP_UNP_DETACH(vp);
		unp->unp_vnode = NULL;
	}
	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
		unp_disconnect(unp, unp2);
	else
		UNP_PCB_UNLOCK(unp);

	/*
	 * Drop each PCB that still refers to us.  The list lock is released
	 * around unp_drop(); the hold taken first keeps ref valid meanwhile.
	 */
	UNP_REF_LIST_LOCK();
	while (!LIST_EMPTY(&unp->unp_refs)) {
		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);

		unp_pcb_hold(ref);
		UNP_REF_LIST_UNLOCK();

		MPASS(ref != unp);
		UNP_PCB_UNLOCK_ASSERT(ref);
		unp_drop(ref);
		UNP_REF_LIST_LOCK();
	}
	UNP_REF_LIST_UNLOCK();

	UNP_PCB_LOCK(unp);
	local_unp_rights = unp_rights;
	unp->unp_socket->so_pcb = NULL;
	unp->unp_socket = NULL;
	free(unp->unp_addr, M_SONAME);
	unp->unp_addr = NULL;
	/* unp_pcb_rele() unlocks and frees on the last reference. */
	if (!unp_pcb_rele(unp))
		UNP_PCB_UNLOCK(unp);
	if (vp) {
		mtx_unlock(vplock);
		vrele(vp);
	}
	if (local_unp_rights)
		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
}

static int
uipc_disconnect(struct socket *so)
{
	struct unpcb *unp, *unp2;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));

	UNP_PCB_LOCK(unp);
	if ((unp2 =
unp_pcb_lock_peer(unp)) != NULL)
		unp_disconnect(unp, unp2);
	else
		UNP_PCB_UNLOCK(unp);
	return (0);
}

/*
 * pru_listen handler.  Only stream and seqpacket sockets may listen
 * (EOPNOTSUPP otherwise), and the socket must be bound to a vnode: an
 * unbound socket yields EINVAL if it is connected and EDESTADDRREQ if not.
 * On success the calling thread's credentials are stored in unp_peercred
 * before the socket is switched to the listening state.  The result of
 * solisten_proto_check() is returned.
 */
static int
uipc_listen(struct socket *so, int backlog, struct thread *td)
{
	struct unpcb *unp;
	int error;

	if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET)
		return (EOPNOTSUPP);

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));

	UNP_PCB_LOCK(unp);
	if (unp->unp_vnode == NULL) {
		/* Already connected or not bound to an address. */
		error = unp->unp_conn != NULL ? EINVAL : EDESTADDRREQ;
		UNP_PCB_UNLOCK(unp);
		return (error);
	}

	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0) {
		cru2xt(td, &unp->unp_peercred);
		solisten_proto(so, backlog);
	}
	SOCK_UNLOCK(so);
	UNP_PCB_UNLOCK(unp);
	return (error);
}

/*
 * pru_peeraddr handler: return a freshly allocated copy of the peer's bound
 * address in *nam, or of sun_noname when there is no peer or the peer is
 * unbound.  Always returns 0; the caller owns the M_SONAME allocation.
 *
 * NOTE(review): unp->unp_conn is read under the link read lock only, without
 * unp's own PCB lock, unlike uipc_accept() which uses unp_pcb_lock_peer().
 * Confirm that the link lock is sufficient to keep unp2 valid here.
 */
static int
uipc_peeraddr(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp, *unp2;
	const struct sockaddr *sa;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));

	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	UNP_LINK_RLOCK();
	/*
	 * XXX: It seems that this test always fails even when connection is
	 * established.  So, this else clause is added as a workaround to
	 * return PF_LOCAL sockaddr.
	 */
	unp2 = unp->unp_conn;
	if (unp2 != NULL) {
		UNP_PCB_LOCK(unp2);
		if (unp2->unp_addr != NULL)
			sa = (struct sockaddr *) unp2->unp_addr;
		else
			sa = &sun_noname;
		bcopy(sa, *nam, sa->sa_len);
		UNP_PCB_UNLOCK(unp2);
	} else {
		sa = &sun_noname;
		bcopy(sa, *nam, sa->sa_len);
	}
	UNP_LINK_RUNLOCK();
	return (0);
}

static int
uipc_rcvd(struct socket *so, int flags)
{
	struct unpcb *unp, *unp2;
	struct socket *so2;
	u_int mbcnt, sbcc;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
	    ("%s: socktype %d", __func__, so->so_type));

	/*
	 * Adjust backpressure on sender and wakeup any waiting to write.
* * The unp lock is acquired to maintain the validity of the unp_conn * pointer; no lock on unp2 is required as unp2->unp_socket will be * static as long as we don't permit unp2 to disconnect from unp, * which is prevented by the lock on unp. We cache values from * so_rcv to avoid holding the so_rcv lock over the entire * transaction on the remote so_snd. */ SOCKBUF_LOCK(&so->so_rcv); mbcnt = so->so_rcv.sb_mbcnt; sbcc = sbavail(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); /* * There is a benign race condition at this point. If we're planning to * clear SB_STOP, but uipc_send is called on the connected socket at * this instant, it might add data to the sockbuf and set SB_STOP. Then * we would erroneously clear SB_STOP below, even though the sockbuf is * full. The race is benign because the only ill effect is to allow the * sockbuf to exceed its size limit, and the size limits are not * strictly guaranteed anyway. */ UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 == NULL) { UNP_PCB_UNLOCK(unp); return (0); } so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_snd); if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax) so2->so_snd.sb_flags &= ~SB_STOP; sowwakeup_locked(so2); UNP_PCB_UNLOCK(unp); return (0); } static int uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct unpcb *unp, *unp2; struct socket *so2; u_int mbcnt, sbcc; int freed, error; unp = sotounpcb(so); KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM || so->so_type == SOCK_SEQPACKET, ("%s: socktype %d", __func__, so->so_type)); freed = error = 0; if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } if (control != NULL && (error = unp_internalize(&control, td))) goto release; unp2 = NULL; switch (so->so_type) { case SOCK_DGRAM: { const struct sockaddr *from; if (nam != NULL) { error = unp_connect(so, nam, td); if (error != 0) break; } 
UNP_PCB_LOCK(unp); /* * Because connect() and send() are non-atomic in a sendto() * with a target address, it's possible that the socket will * have disconnected before the send() can run. In that case * return the slightly counter-intuitive but otherwise * correct error that the socket is not connected. */ unp2 = unp_pcb_lock_peer(unp); if (unp2 == NULL) { UNP_PCB_UNLOCK(unp); error = ENOTCONN; break; } if (unp2->unp_flags & UNP_WANTCRED_MASK) control = unp_addsockcred(td, control, unp2->unp_flags); if (unp->unp_addr != NULL) from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_rcv); if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { sorwakeup_locked(so2); m = NULL; control = NULL; } else { SOCKBUF_UNLOCK(&so2->so_rcv); error = ENOBUFS; } if (nam != NULL) unp_disconnect(unp, unp2); else unp_pcb_unlock_pair(unp, unp2); break; } case SOCK_SEQPACKET: case SOCK_STREAM: if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { error = unp_connect(so, nam, td); if (error != 0) break; } else { error = ENOTCONN; break; } } UNP_PCB_LOCK(unp); if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) { UNP_PCB_UNLOCK(unp); error = ENOTCONN; break; } else if (so->so_snd.sb_state & SBS_CANTSENDMORE) { unp_pcb_unlock_pair(unp, unp2); error = EPIPE; break; } UNP_PCB_UNLOCK(unp); if ((so2 = unp2->unp_socket) == NULL) { UNP_PCB_UNLOCK(unp2); error = ENOTCONN; break; } SOCKBUF_LOCK(&so2->so_rcv); if (unp2->unp_flags & UNP_WANTCRED_MASK) { /* * Credentials are passed only once on SOCK_STREAM and * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS). */ control = unp_addsockcred(td, control, unp2->unp_flags); unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT; } /* * Send to paired receive port and wake up readers. 
Don't * check for space available in the receive buffer if we're * attaching ancillary data; Unix domain sockets only check * for space in the sending sockbuf, and that check is * performed one level up the stack. At that level we cannot * precisely account for the amount of buffer space used * (e.g., because control messages are not yet internalized). */ switch (so->so_type) { case SOCK_STREAM: if (control != NULL) { sbappendcontrol_locked(&so2->so_rcv, m, control, flags); control = NULL; } else sbappend_locked(&so2->so_rcv, m, flags); break; case SOCK_SEQPACKET: if (sbappendaddr_nospacecheck_locked(&so2->so_rcv, &sun_noname, m, control)) control = NULL; break; } mbcnt = so2->so_rcv.sb_mbcnt; sbcc = sbavail(&so2->so_rcv); if (sbcc) sorwakeup_locked(so2); else SOCKBUF_UNLOCK(&so2->so_rcv); /* * The PCB lock on unp2 protects the SB_STOP flag. Without it, * it would be possible for uipc_rcvd to be called at this * point, drain the receiving sockbuf, clear SB_STOP, and then * we would set SB_STOP below. That could lead to an empty * sockbuf having SB_STOP set */ SOCKBUF_LOCK(&so->so_snd); if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax) so->so_snd.sb_flags |= SB_STOP; SOCKBUF_UNLOCK(&so->so_snd); UNP_PCB_UNLOCK(unp2); m = NULL; break; } /* * PRUS_EOF is equivalent to pru_send followed by pru_shutdown. */ if (flags & PRUS_EOF) { UNP_PCB_LOCK(unp); socantsendmore(so); unp_shutdown(unp); UNP_PCB_UNLOCK(unp); } if (control != NULL && error != 0) unp_dispose_mbuf(control); release: if (control != NULL) m_freem(control); /* * In case of PRUS_NOTREADY, uipc_ready() is responsible * for freeing memory. 
*/
	if (m != NULL && (flags & PRUS_NOTREADY) == 0)
		m_freem(m);
	return (error);
}

/*
 * Look for mbuf m in the receive buffer of socket so.  If it is found,
 * sbready() is applied to it for count mbufs and its result is stored in
 * *errorp.
 *
 * Returns true when m was found in so's receive buffer, false otherwise
 * (including when so is a listening socket or the buffer has no not-ready
 * data, i.e. sb_fnrdy is NULL).  *errorp is left untouched on false.
 */
static bool
uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp)
{
	struct mbuf *mb, *n;
	struct sockbuf *sb;

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		SOCK_UNLOCK(so);
		return (false);
	}
	mb = NULL;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	if (sb->sb_fnrdy != NULL) {
		/*
		 * Walk every mbuf of every record: follow m_next within a
		 * record and, at its end, continue with the record saved in n.
		 */
		for (mb = sb->sb_mb, n = mb->m_nextpkt; mb != NULL;) {
			if (mb == m) {
				*errorp = sbready(sb, m, count);
				break;
			}
			mb = mb->m_next;
			if (mb == NULL) {
				mb = n;
				if (mb != NULL)
					n = mb->m_nextpkt;
			}
		}
	}
	SOCKBUF_UNLOCK(sb);
	SOCK_UNLOCK(so);
	/* mb is non-NULL only if the loop stopped on a match. */
	return (mb != NULL);
}

static int
uipc_ready(struct socket *so, struct mbuf *m, int count)
{
	struct unpcb *unp, *unp2;
	struct socket *so2;
	int error, i;

	unp = sotounpcb(so);

	KASSERT(so->so_type == SOCK_STREAM,
	    ("%s: unexpected socket type for %p", __func__, so));

	UNP_PCB_LOCK(unp);
	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
		UNP_PCB_UNLOCK(unp);
		so2 = unp2->unp_socket;
		SOCKBUF_LOCK(&so2->so_rcv);
		if ((error = sbready(&so2->so_rcv, m, count)) == 0)
			sorwakeup_locked(so2);
		else
			SOCKBUF_UNLOCK(&so2->so_rcv);
		UNP_PCB_UNLOCK(unp2);
		return (error);
	}
	UNP_PCB_UNLOCK(unp);

	/*
	 * The receiving socket has been disconnected, but may still be valid.
	 * In this case, the now-ready mbufs are still present in its socket
	 * buffer, so perform an exhaustive search before giving up and freeing
	 * the mbufs.
*/
	UNP_LINK_RLOCK();
	LIST_FOREACH(unp, &unp_shead, unp_link) {
		if (uipc_ready_scan(unp->unp_socket, m, count, &error))
			break;
	}
	UNP_LINK_RUNLOCK();

	if (unp == NULL) {
		for (i = 0; i < count; i++)
			m = m_free(m);
		error = ECONNRESET;
	}
	return (error);
}

/*
 * pru_sense handler: fill in the stat fields this layer knows about.  The
 * block size is the send buffer's high-water mark, the device is NODEV and
 * the inode number is the PCB's unp_ino.  Always returns 0.
 */
static int
uipc_sense(struct socket *so, struct stat *sb)
{
	struct unpcb *unp;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));

	sb->st_blksize = so->so_snd.sb_hiwat;
	sb->st_dev = NODEV;
	sb->st_ino = unp->unp_ino;
	return (0);
}

/*
 * pru_shutdown handler: with the PCB locked, mark the socket as unable to
 * send any more and let unp_shutdown() act on the PCB.  Always returns 0.
 */
static int
uipc_shutdown(struct socket *so)
{
	struct unpcb *unp;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));

	UNP_PCB_LOCK(unp);
	socantsendmore(so);
	unp_shutdown(unp);
	UNP_PCB_UNLOCK(unp);
	return (0);
}

/*
 * pru_sockaddr handler: return a freshly allocated copy of the socket's own
 * bound address in *nam, or of sun_noname if it is unbound.  Always returns
 * 0; the caller owns the M_SONAME allocation.
 */
static int
uipc_sockaddr(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp;
	const struct sockaddr *sa;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));

	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	UNP_PCB_LOCK(unp);
	if (unp->unp_addr != NULL)
		sa = (struct sockaddr *) unp->unp_addr;
	else
		sa = &sun_noname;
	bcopy(sa, *nam, sa->sa_len);
	UNP_PCB_UNLOCK(unp);
	return (0);
}

/* User-request switch for SOCK_DGRAM; receives via soreceive_dgram(). */
static struct pr_usrreqs uipc_usrreqs_dgram = {
	.pru_abort = 		uipc_abort,
	.pru_accept =		uipc_accept,
	.pru_attach =		uipc_attach,
	.pru_bind =		uipc_bind,
	.pru_bindat =		uipc_bindat,
	.pru_connect =		uipc_connect,
	.pru_connectat =	uipc_connectat,
	.pru_connect2 =		uipc_connect2,
	.pru_detach =		uipc_detach,
	.pru_disconnect =	uipc_disconnect,
	.pru_listen =		uipc_listen,
	.pru_peeraddr =		uipc_peeraddr,
	.pru_rcvd =		uipc_rcvd,
	.pru_send =		uipc_send,
	.pru_sense =		uipc_sense,
	.pru_shutdown =		uipc_shutdown,
	.pru_sockaddr =		uipc_sockaddr,
	.pru_soreceive =	soreceive_dgram,
	.pru_close =		uipc_close,
};

static struct pr_usrreqs uipc_usrreqs_seqpacket = {
	.pru_abort =		uipc_abort,
	.pru_accept =		uipc_accept,
	.pru_attach =		uipc_attach,
	.pru_bind =		uipc_bind,
	.pru_bindat =		uipc_bindat,
	.pru_connect =		uipc_connect,
	.pru_connectat =	uipc_connectat,
.pru_connect2 = uipc_connect2, .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, .pru_sense = uipc_sense, .pru_shutdown = uipc_shutdown, .pru_sockaddr = uipc_sockaddr, .pru_soreceive = soreceive_generic, /* XXX: or...? */ .pru_close = uipc_close, }; static struct pr_usrreqs uipc_usrreqs_stream = { .pru_abort = uipc_abort, .pru_accept = uipc_accept, .pru_attach = uipc_attach, .pru_bind = uipc_bind, .pru_bindat = uipc_bindat, .pru_connect = uipc_connect, .pru_connectat = uipc_connectat, .pru_connect2 = uipc_connect2, .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, .pru_ready = uipc_ready, .pru_sense = uipc_sense, .pru_shutdown = uipc_shutdown, .pru_sockaddr = uipc_sockaddr, .pru_soreceive = soreceive_generic, .pru_close = uipc_close, }; static int uipc_ctloutput(struct socket *so, struct sockopt *sopt) { struct unpcb *unp; struct xucred xu; int error, optval; if (sopt->sopt_level != SOL_LOCAL) return (EINVAL); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case LOCAL_PEERCRED: UNP_PCB_LOCK(unp); if (unp->unp_flags & UNP_HAVEPC) xu = unp->unp_peercred; else { if (so->so_type == SOCK_STREAM) error = ENOTCONN; else error = EINVAL; } UNP_PCB_UNLOCK(unp); if (error == 0) error = sooptcopyout(sopt, &xu, sizeof(xu)); break; case LOCAL_CREDS: /* Unlocked read. */ optval = unp->unp_flags & UNP_WANTCRED_ONESHOT ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case LOCAL_CREDS_PERSISTENT: /* Unlocked read. */ optval = unp->unp_flags & UNP_WANTCRED_ALWAYS ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case LOCAL_CONNWAIT: /* Unlocked read. */ optval = unp->unp_flags & UNP_CONNWAIT ? 
1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EOPNOTSUPP; break; } break; case SOPT_SET: switch (sopt->sopt_name) { case LOCAL_CREDS: case LOCAL_CREDS_PERSISTENT: case LOCAL_CONNWAIT: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; #define OPTSET(bit, exclusive) do { \ UNP_PCB_LOCK(unp); \ if (optval) { \ if ((unp->unp_flags & (exclusive)) != 0) { \ UNP_PCB_UNLOCK(unp); \ error = EINVAL; \ break; \ } \ unp->unp_flags |= (bit); \ } else \ unp->unp_flags &= ~(bit); \ UNP_PCB_UNLOCK(unp); \ } while (0) switch (sopt->sopt_name) { case LOCAL_CREDS: OPTSET(UNP_WANTCRED_ONESHOT, UNP_WANTCRED_ALWAYS); break; case LOCAL_CREDS_PERSISTENT: OPTSET(UNP_WANTCRED_ALWAYS, UNP_WANTCRED_ONESHOT); break; case LOCAL_CONNWAIT: OPTSET(UNP_CONNWAIT, 0); break; default: break; } break; #undef OPTSET default: error = ENOPROTOOPT; break; } break; default: error = EOPNOTSUPP; break; } return (error); } static int unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (unp_connectat(AT_FDCWD, so, nam, td)); } static int unp_connectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { struct mtx *vplock; struct sockaddr_un *soun; struct vnode *vp; struct socket *so2; struct unpcb *unp, *unp2, *unp3; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; struct sockaddr *sa; cap_rights_t rights; int error, len; bool connreq; if (nam->sa_family != AF_UNIX) return (EAFNOSUPPORT); if (nam->sa_len > sizeof(struct sockaddr_un)) return (EINVAL); len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return (EINVAL); soun = (struct sockaddr_un *)nam; bcopy(soun->sun_path, buf, len); buf[len] = 0; unp = sotounpcb(so); UNP_PCB_LOCK(unp); for (;;) { /* * Wait for connection state to stabilize. If a connection * already exists, give up. 
For datagram sockets, which permit * multiple consecutive connect(2) calls, upper layers are * responsible for disconnecting in advance of a subsequent * connect(2), but this is not synchronized with PCB connection * state. * * Also make sure that no threads are currently attempting to * lock the peer socket, to ensure that unp_conn cannot * transition between two valid sockets while locks are dropped. */ if (unp->unp_conn != NULL) { UNP_PCB_UNLOCK(unp); return (EISCONN); } if ((unp->unp_flags & UNP_CONNECTING) != 0) { UNP_PCB_UNLOCK(unp); return (EALREADY); } if (unp->unp_pairbusy > 0) { unp->unp_flags |= UNP_WAITING; mtx_sleep(unp, UNP_PCB_LOCKPTR(unp), 0, "unpeer", 0); continue; } break; } unp->unp_flags |= UNP_CONNECTING; UNP_PCB_UNLOCK(unp); connreq = (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0; if (connreq) sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); else sa = NULL; NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_CONNECTAT), td); error = namei(&nd); if (error) vp = NULL; else vp = nd.ni_vp; ASSERT_VOP_LOCKED(vp, "unp_connect"); NDFREE_NOTHING(&nd); if (error) goto bad; if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD); if (error) goto bad; #endif error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); if (error) goto bad; unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); vplock = mtx_pool_find(mtxpool_sleep, vp); mtx_lock(vplock); VOP_UNP_CONNECT(vp, &unp2); if (unp2 == NULL) { error = ECONNREFUSED; goto bad2; } so2 = unp2->unp_socket; if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad2; } if (connreq) { - if (so2->so_options & SO_ACCEPTCONN) { + if (SOLISTENING(so2)) { CURVNET_SET(so2->so_vnet); so2 = sonewconn(so2, 0); CURVNET_RESTORE(); } else so2 = NULL; if (so2 == NULL) { error = ECONNREFUSED; goto bad2; } unp3 = sotounpcb(so2); unp_pcb_lock_pair(unp2, 
unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); unp3->unp_addr = (struct sockaddr_un *) sa; sa = NULL; } unp_copy_peercred(td, unp3, unp, unp2); UNP_PCB_UNLOCK(unp2); unp2 = unp3; /* * It is safe to block on the PCB lock here since unp2 is * nascent and cannot be connected to any other sockets. */ UNP_PCB_LOCK(unp); #ifdef MAC mac_socketpeer_set_from_socket(so, so2); mac_socketpeer_set_from_socket(so2, so); #endif } else { unp_pcb_lock_pair(unp, unp2); } KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 && sotounpcb(so2) == unp2, ("%s: unp2 %p so2 %p", __func__, unp2, so2)); error = unp_connect2(so, so2, PRU_CONNECT); unp_pcb_unlock_pair(unp, unp2); bad2: mtx_unlock(vplock); bad: if (vp != NULL) { vput(vp); } free(sa, M_SONAME); UNP_PCB_LOCK(unp); KASSERT((unp->unp_flags & UNP_CONNECTING) != 0, ("%s: unp %p has UNP_CONNECTING clear", __func__, unp)); unp->unp_flags &= ~UNP_CONNECTING; UNP_PCB_UNLOCK(unp); return (error); } /* * Set socket peer credentials at connection time. * * The client's PCB credentials are copied from its process structure. The * server's PCB credentials are copied from the socket on which it called * listen(2). uipc_listen cached that process's credentials at the time. 
*/ void unp_copy_peercred(struct thread *td, struct unpcb *client_unp, struct unpcb *server_unp, struct unpcb *listen_unp) { cru2xt(td, &client_unp->unp_peercred); client_unp->unp_flags |= UNP_HAVEPC; memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred, sizeof(server_unp->unp_peercred)); server_unp->unp_flags |= UNP_HAVEPC; client_unp->unp_flags |= (listen_unp->unp_flags & UNP_WANTCRED_MASK); } static int unp_connect2(struct socket *so, struct socket *so2, int req) { struct unpcb *unp; struct unpcb *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect2: unp == NULL")); unp2 = sotounpcb(so2); KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); UNP_PCB_LOCK_ASSERT(unp); UNP_PCB_LOCK_ASSERT(unp2); KASSERT(unp->unp_conn == NULL, ("%s: socket %p is already connected", __func__, unp)); if (so2->so_type != so->so_type) return (EPROTOTYPE); unp->unp_conn = unp2; unp_pcb_hold(unp2); unp_pcb_hold(unp); switch (so->so_type) { case SOCK_DGRAM: UNP_REF_LIST_LOCK(); LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); UNP_REF_LIST_UNLOCK(); soisconnected(so); break; case SOCK_STREAM: case SOCK_SEQPACKET: KASSERT(unp2->unp_conn == NULL, ("%s: socket %p is already connected", __func__, unp2)); unp2->unp_conn = unp; if (req == PRU_CONNECT && ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) soisconnecting(so); else soisconnected(so); soisconnected(so2); break; default: panic("unp_connect2"); } return (0); } static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2) { struct socket *so, *so2; #ifdef INVARIANTS struct unpcb *unptmp; #endif UNP_PCB_LOCK_ASSERT(unp); UNP_PCB_LOCK_ASSERT(unp2); KASSERT(unp->unp_conn == unp2, ("%s: unpcb %p is not connected to %p", __func__, unp, unp2)); unp->unp_conn = NULL; so = unp->unp_socket; so2 = unp2->unp_socket; switch (unp->unp_socket->so_type) { case SOCK_DGRAM: UNP_REF_LIST_LOCK(); #ifdef INVARIANTS LIST_FOREACH(unptmp, &unp2->unp_refs, unp_reflink) { if (unptmp == unp) break; } KASSERT(unptmp != NULL, ("%s: %p 
not found in reflist of %p", __func__, unp, unp2)); #endif LIST_REMOVE(unp, unp_reflink); UNP_REF_LIST_UNLOCK(); if (so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); } break; case SOCK_STREAM: case SOCK_SEQPACKET: if (so) soisdisconnected(so); MPASS(unp2->unp_conn == unp); unp2->unp_conn = NULL; if (so2) soisdisconnected(so2); break; } if (unp == unp2) { unp_pcb_rele_notlast(unp); if (!unp_pcb_rele(unp)) UNP_PCB_UNLOCK(unp); } else { if (!unp_pcb_rele(unp)) UNP_PCB_UNLOCK(unp); if (!unp_pcb_rele(unp2)) UNP_PCB_UNLOCK(unp2); } } /* * unp_pcblist() walks the global list of struct unpcb's to generate a * pointer list, bumping the refcount on each unpcb. It then copies them out * sequentially, validating the generation number on each to see if it has * been detached. All of this is necessary because copyout() may sleep on * disk I/O. */ static int unp_pcblist(SYSCTL_HANDLER_ARGS) { struct unpcb *unp, **unp_list; unp_gen_t gencnt; struct xunpgen *xug; struct unp_head *head; struct xunpcb *xu; u_int i; int error, n; switch ((intptr_t)arg1) { case SOCK_STREAM: head = &unp_shead; break; case SOCK_DGRAM: head = &unp_dhead; break; case SOCK_SEQPACKET: head = &unp_sphead; break; default: panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1); } /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = unp_count; req->oldidx = 2 * (sizeof *xug) + (n + n/8) * sizeof(struct xunpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. 
*/ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO); UNP_LINK_RLOCK(); gencnt = unp_gencnt; n = unp_count; UNP_LINK_RUNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; xug->xug_gen = gencnt; xug->xug_sogen = so_gencnt; error = SYSCTL_OUT(req, xug, sizeof *xug); if (error) { free(xug, M_TEMP); return (error); } unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); UNP_LINK_RLOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { UNP_PCB_LOCK(unp); if (unp->unp_gencnt <= gencnt) { if (cr_cansee(req->td->td_ucred, unp->unp_socket->so_cred)) { UNP_PCB_UNLOCK(unp); continue; } unp_list[i++] = unp; unp_pcb_hold(unp); } UNP_PCB_UNLOCK(unp); } UNP_LINK_RUNLOCK(); n = i; /* In case we lost some during malloc. */ error = 0; xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); for (i = 0; i < n; i++) { unp = unp_list[i]; UNP_PCB_LOCK(unp); if (unp_pcb_rele(unp)) continue; if (unp->unp_gencnt <= gencnt) { xu->xu_len = sizeof *xu; xu->xu_unpp = (uintptr_t)unp; /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. */ if (unp->unp_addr != NULL) bcopy(unp->unp_addr, &xu->xu_addr, unp->unp_addr->sun_len); else bzero(&xu->xu_addr, sizeof(xu->xu_addr)); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) bcopy(unp->unp_conn->unp_addr, &xu->xu_caddr, unp->unp_conn->unp_addr->sun_len); else bzero(&xu->xu_caddr, sizeof(xu->xu_caddr)); xu->unp_vnode = (uintptr_t)unp->unp_vnode; xu->unp_conn = (uintptr_t)unp->unp_conn; xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs); xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink); xu->unp_gencnt = unp->unp_gencnt; sotoxsocket(unp->unp_socket, &xu->xu_socket); UNP_PCB_UNLOCK(unp); error = SYSCTL_OUT(req, xu, sizeof *xu); } else { UNP_PCB_UNLOCK(unp); } } free(xu, M_TEMP); if (!error) { /* * Give the user an updated idea of our state. 
If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ xug->xug_gen = unp_gencnt; xug->xug_sogen = so_gencnt; xug->xug_count = unp_count; error = SYSCTL_OUT(req, xug, sizeof *xug); } free(unp_list, M_TEMP); free(xug, M_TEMP); return (error); } SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb", "List of active local seqpacket sockets"); static void unp_shutdown(struct unpcb *unp) { struct unpcb *unp2; struct socket *so; UNP_PCB_LOCK_ASSERT(unp); unp2 = unp->unp_conn; if ((unp->unp_socket->so_type == SOCK_STREAM || (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) { so = unp2->unp_socket; if (so != NULL) socantrcvmore(so); } } static void unp_drop(struct unpcb *unp) { struct socket *so = unp->unp_socket; struct unpcb *unp2; /* * Regardless of whether the socket's peer dropped the connection * with this socket by aborting or disconnecting, POSIX requires * that ECONNRESET is returned. */ UNP_PCB_LOCK(unp); if (so) so->so_error = ECONNRESET; if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) { /* Last reference dropped in unp_disconnect(). 
*/ unp_pcb_rele_notlast(unp); unp_disconnect(unp, unp2); } else if (!unp_pcb_rele(unp)) { UNP_PCB_UNLOCK(unp); } } static void unp_freerights(struct filedescent **fdep, int fdcount) { struct file *fp; int i; KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount)); for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; filecaps_free(&fdep[i]->fde_caps); unp_discard(fp); } free(fdep[0], M_FILECAPS); } static int unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags) { struct thread *td = curthread; /* XXX */ struct cmsghdr *cm = mtod(control, struct cmsghdr *); int i; int *fdp; struct filedesc *fdesc = td->td_proc->p_fd; struct filedescent **fdep; void *data; socklen_t clen = control->m_len, datalen; int error, newfds; u_int newlen; UNP_LINK_UNLOCK_ASSERT(); error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) { error = EINVAL; break; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { newfds = datalen / sizeof(*fdep); if (newfds == 0) goto next; fdep = data; /* If we're not outputting the descriptors free them. */ if (error || controlp == NULL) { unp_freerights(fdep, newfds); goto next; } FILEDESC_XLOCK(fdesc); /* * Now change each pointer to an fd in the global * table to an integer that is the index to the local * fd table entry that we set up to point to the * global one we are transferring. 
*/ newlen = newfds * sizeof(int); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_XUNLOCK(fdesc); error = E2BIG; unp_freerights(fdep, newfds); goto next; } fdp = (int *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); if (fdallocn(td, 0, fdp, newfds) != 0) { FILEDESC_XUNLOCK(fdesc); error = EMSGSIZE; unp_freerights(fdep, newfds); m_freem(*controlp); *controlp = NULL; goto next; } for (i = 0; i < newfds; i++, fdp++) { _finstall(fdesc, fdep[i]->fde_file, *fdp, (flags & MSG_CMSG_CLOEXEC) != 0 ? O_CLOEXEC : 0, &fdep[i]->fde_caps); unp_externalize_fp(fdep[i]->fde_file); } /* * The new type indicates that the mbuf data refers to * kernel resources that may need to be released before * the mbuf is freed. */ m_chtype(*controlp, MT_EXTCONTROL); FILEDESC_XUNLOCK(fdesc); free(fdep[0], M_FILECAPS); } else { /* We can just copy anything else across. */ if (error || controlp == NULL) goto next; *controlp = sbcreatecontrol(NULL, datalen, cm->cmsg_type, cm->cmsg_level); if (*controlp == NULL) { error = ENOBUFS; goto next; } bcopy(data, CMSG_DATA(mtod(*controlp, struct cmsghdr *)), datalen); } controlp = &(*controlp)->m_next; next: if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } m_freem(control); return (error); } static void unp_zone_change(void *tag) { uma_zone_set_max(unp_zone, maxsockets); } #ifdef INVARIANTS static void unp_zdtor(void *mem, int size __unused, void *arg __unused) { struct unpcb *unp; unp = mem; KASSERT(LIST_EMPTY(&unp->unp_refs), ("%s: unpcb %p has lingering refs", __func__, unp)); KASSERT(unp->unp_socket == NULL, ("%s: unpcb %p has socket backpointer", __func__, unp)); KASSERT(unp->unp_vnode == NULL, ("%s: unpcb %p has vnode references", __func__, unp)); KASSERT(unp->unp_conn == NULL, ("%s: unpcb %p is still connected", __func__, unp)); KASSERT(unp->unp_addr == NULL, ("%s: unpcb %p has leaked addr", 
__func__, unp)); } #endif static void unp_init(void) { uma_dtor dtor; #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) return; #endif #ifdef INVARIANTS dtor = unp_zdtor; #else dtor = NULL; #endif unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, dtor, NULL, NULL, UMA_ALIGN_CACHE, 0); uma_zone_set_max(unp_zone, maxsockets); uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, NULL, EVENTHANDLER_PRI_ANY); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); LIST_INIT(&unp_sphead); SLIST_INIT(&unp_defers); TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL); TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL); UNP_LINK_LOCK_INIT(); UNP_DEFERRED_LOCK_INIT(); } static void unp_internalize_cleanup_rights(struct mbuf *control) { struct cmsghdr *cp; struct mbuf *m; void *data; socklen_t datalen; for (m = control; m != NULL; m = m->m_next) { cp = mtod(m, struct cmsghdr *); if (cp->cmsg_level != SOL_SOCKET || cp->cmsg_type != SCM_RIGHTS) continue; data = CMSG_DATA(cp); datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data; unp_freerights(data, datalen / sizeof(struct filedesc *)); } } static int unp_internalize(struct mbuf **controlp, struct thread *td) { struct mbuf *control, **initial_controlp; struct proc *p; struct filedesc *fdesc; struct bintime *bt; struct cmsghdr *cm; struct cmsgcred *cmcred; struct filedescent *fde, **fdep, *fdev; struct file *fp; struct timeval *tv; struct timespec *ts; void *data; socklen_t clen, datalen; int i, j, error, *fdp, oldfds; u_int newlen; UNP_LINK_UNLOCK_ASSERT(); p = td->td_proc; fdesc = p->p_fd; error = 0; control = *controlp; clen = control->m_len; *controlp = NULL; initial_controlp = controlp; for (cm = mtod(control, struct cmsghdr *); cm != NULL;) { if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) { error = EINVAL; goto out; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len 
- (caddr_t)data; switch (cm->cmsg_type) { /* * Fill in credential information. */ case SCM_CREDS: *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } cmcred = (struct cmsgcred *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); cmcred->cmcred_pid = p->p_pid; cmcred->cmcred_uid = td->td_ucred->cr_ruid; cmcred->cmcred_gid = td->td_ucred->cr_rgid; cmcred->cmcred_euid = td->td_ucred->cr_uid; cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); for (i = 0; i < cmcred->cmcred_ngroups; i++) cmcred->cmcred_groups[i] = td->td_ucred->cr_groups[i]; break; case SCM_RIGHTS: oldfds = datalen / sizeof (int); if (oldfds == 0) break; /* * Check that all the FDs passed in refer to legal * files. If not, reject the entire operation. */ fdp = data; FILEDESC_SLOCK(fdesc); for (i = 0; i < oldfds; i++, fdp++) { fp = fget_locked(fdesc, *fdp); if (fp == NULL) { FILEDESC_SUNLOCK(fdesc); error = EBADF; goto out; } if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { FILEDESC_SUNLOCK(fdesc); error = EOPNOTSUPP; goto out; } } /* * Now replace the integer FDs with pointers to the * file structure and capability rights. */ newlen = oldfds * sizeof(fdep[0]); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_SUNLOCK(fdesc); error = E2BIG; goto out; } fdp = data; for (i = 0; i < oldfds; i++, fdp++) { if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) { fdp = data; for (j = 0; j < i; j++, fdp++) { fdrop(fdesc->fd_ofiles[*fdp]. 
fde_file, td); } FILEDESC_SUNLOCK(fdesc); error = EBADF; goto out; } } fdp = data; fdep = (struct filedescent **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS, M_WAITOK); for (i = 0; i < oldfds; i++, fdev++, fdp++) { fde = &fdesc->fd_ofiles[*fdp]; fdep[i] = fdev; fdep[i]->fde_file = fde->fde_file; filecaps_copy(&fde->fde_caps, &fdep[i]->fde_caps, true); unp_internalize_fp(fdep[i]->fde_file); } FILEDESC_SUNLOCK(fdesc); break; case SCM_TIMESTAMP: *controlp = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } tv = (struct timeval *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); microtime(tv); break; case SCM_BINTIME: *controlp = sbcreatecontrol(NULL, sizeof(*bt), SCM_BINTIME, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } bt = (struct bintime *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); bintime(bt); break; case SCM_REALTIME: *controlp = sbcreatecontrol(NULL, sizeof(*ts), SCM_REALTIME, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } ts = (struct timespec *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); nanotime(ts); break; case SCM_MONOTONIC: *controlp = sbcreatecontrol(NULL, sizeof(*ts), SCM_MONOTONIC, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } ts = (struct timespec *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); nanouptime(ts); break; default: error = EINVAL; goto out; } if (*controlp != NULL) controlp = &(*controlp)->m_next; if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } out: if (error != 0 && initial_controlp != NULL) unp_internalize_cleanup_rights(*initial_controlp); m_freem(control); return (error); } static struct mbuf * unp_addsockcred(struct thread *td, struct mbuf *control, int mode) { struct mbuf *m, *n, *n_prev; const struct cmsghdr *cm; int ngroups, i, cmsgtype; 
size_t ctrlsz; ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); if (mode & UNP_WANTCRED_ALWAYS) { ctrlsz = SOCKCRED2SIZE(ngroups); cmsgtype = SCM_CREDS2; } else { ctrlsz = SOCKCREDSIZE(ngroups); cmsgtype = SCM_CREDS; } m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET); if (m == NULL) return (control); if (mode & UNP_WANTCRED_ALWAYS) { struct sockcred2 *sc; sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *)); sc->sc_version = 0; sc->sc_pid = td->td_proc->p_pid; sc->sc_uid = td->td_ucred->cr_ruid; sc->sc_euid = td->td_ucred->cr_uid; sc->sc_gid = td->td_ucred->cr_rgid; sc->sc_egid = td->td_ucred->cr_gid; sc->sc_ngroups = ngroups; for (i = 0; i < sc->sc_ngroups; i++) sc->sc_groups[i] = td->td_ucred->cr_groups[i]; } else { struct sockcred *sc; sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *)); sc->sc_uid = td->td_ucred->cr_ruid; sc->sc_euid = td->td_ucred->cr_uid; sc->sc_gid = td->td_ucred->cr_rgid; sc->sc_egid = td->td_ucred->cr_gid; sc->sc_ngroups = ngroups; for (i = 0; i < sc->sc_ngroups; i++) sc->sc_groups[i] = td->td_ucred->cr_groups[i]; } /* * Unlink SCM_CREDS control messages (struct cmsgcred), since just * created SCM_CREDS control message (struct sockcred) has another * format. */ if (control != NULL && cmsgtype == SCM_CREDS) for (n = control, n_prev = NULL; n != NULL;) { cm = mtod(n, struct cmsghdr *); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_CREDS) { if (n_prev == NULL) control = n->m_next; else n_prev->m_next = n->m_next; n = m_free(n); } else { n_prev = n; n = n->m_next; } } /* Prepend it to the head. 
*/ m->m_next = control; return (m); } static struct unpcb * fptounp(struct file *fp) { struct socket *so; if (fp->f_type != DTYPE_SOCKET) return (NULL); if ((so = fp->f_data) == NULL) return (NULL); if (so->so_proto->pr_domain != &localdomain) return (NULL); return sotounpcb(so); } static void unp_discard(struct file *fp) { struct unp_defer *dr; if (unp_externalize_fp(fp)) { dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK); dr->ud_fp = fp; UNP_DEFERRED_LOCK(); SLIST_INSERT_HEAD(&unp_defers, dr, ud_link); UNP_DEFERRED_UNLOCK(); atomic_add_int(&unp_defers_count, 1); taskqueue_enqueue(taskqueue_thread, &unp_defer_task); } else closef_nothread(fp); } static void unp_process_defers(void *arg __unused, int pending) { struct unp_defer *dr; SLIST_HEAD(, unp_defer) drl; int count; SLIST_INIT(&drl); for (;;) { UNP_DEFERRED_LOCK(); if (SLIST_FIRST(&unp_defers) == NULL) { UNP_DEFERRED_UNLOCK(); break; } SLIST_SWAP(&unp_defers, &drl, unp_defer); UNP_DEFERRED_UNLOCK(); count = 0; while ((dr = SLIST_FIRST(&drl)) != NULL) { SLIST_REMOVE_HEAD(&drl, ud_link); closef_nothread(dr->ud_fp); free(dr, M_TEMP); count++; } atomic_add_int(&unp_defers_count, -count); } } static void unp_internalize_fp(struct file *fp) { struct unpcb *unp; UNP_LINK_WLOCK(); if ((unp = fptounp(fp)) != NULL) { unp->unp_file = fp; unp->unp_msgcount++; } unp_rights++; UNP_LINK_WUNLOCK(); } static int unp_externalize_fp(struct file *fp) { struct unpcb *unp; int ret; UNP_LINK_WLOCK(); if ((unp = fptounp(fp)) != NULL) { unp->unp_msgcount--; ret = 1; } else ret = 0; unp_rights--; UNP_LINK_WUNLOCK(); return (ret); } /* * unp_defer indicates whether additional work has been defered for a future * pass through unp_gc(). It is thread local and does not require explicit * synchronization. */ static int unp_marked; static void unp_remove_dead_ref(struct filedescent **fdep, int fdcount) { struct unpcb *unp; struct file *fp; int i; /* * This function can only be called from the gc task. 
*/ KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0, ("%s: not on gc callout", __func__)); UNP_LINK_LOCK_ASSERT(); for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; if ((unp = fptounp(fp)) == NULL) continue; if ((unp->unp_gcflag & UNPGC_DEAD) == 0) continue; unp->unp_gcrefs--; } } static void unp_restore_undead_ref(struct filedescent **fdep, int fdcount) { struct unpcb *unp; struct file *fp; int i; /* * This function can only be called from the gc task. */ KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0, ("%s: not on gc callout", __func__)); UNP_LINK_LOCK_ASSERT(); for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; if ((unp = fptounp(fp)) == NULL) continue; if ((unp->unp_gcflag & UNPGC_DEAD) == 0) continue; unp->unp_gcrefs++; unp_marked++; } } static void unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int)) { struct socket *so, *soa; so = unp->unp_socket; SOCK_LOCK(so); if (SOLISTENING(so)) { /* * Mark all sockets in our accept queue. */ TAILQ_FOREACH(soa, &so->sol_comp, so_list) { if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) continue; SOCKBUF_LOCK(&soa->so_rcv); unp_scan(soa->so_rcv.sb_mb, op); SOCKBUF_UNLOCK(&soa->so_rcv); } } else { /* * Mark all sockets we reference with RIGHTS. 
*/ if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { SOCKBUF_LOCK(&so->so_rcv); unp_scan(so->so_rcv.sb_mb, op); SOCKBUF_UNLOCK(&so->so_rcv); } } SOCK_UNLOCK(so); } static int unp_recycled; SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, "Number of unreachable sockets claimed by the garbage collector."); static int unp_taskcount; SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, "Number of times the garbage collector has run."); SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0, "Number of active local sockets."); static void unp_gc(__unused void *arg, int pending) { struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead, NULL }; struct unp_head **head; struct unp_head unp_deadhead; /* List of potentially-dead sockets. */ struct file *f, **unref; struct unpcb *unp, *unptmp; int i, total, unp_unreachable; LIST_INIT(&unp_deadhead); unp_taskcount++; UNP_LINK_RLOCK(); /* * First determine which sockets may be in cycles. */ unp_unreachable = 0; for (head = heads; *head != NULL; head++) LIST_FOREACH(unp, *head, unp_link) { KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0, ("%s: unp %p has unexpected gc flags 0x%x", __func__, unp, (unsigned int)unp->unp_gcflag)); f = unp->unp_file; /* * Check for an unreachable socket potentially in a * cycle. It must be in a queue as indicated by * msgcount, and this must equal the file reference * count. Note that when msgcount is 0 the file is * NULL. */ if (f != NULL && unp->unp_msgcount != 0 && refcount_load(&f->f_count) == unp->unp_msgcount) { LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead); unp->unp_gcflag |= UNPGC_DEAD; unp->unp_gcrefs = unp->unp_msgcount; unp_unreachable++; } } /* * Scan all sockets previously marked as potentially being in a cycle * and remove the references each socket holds on any UNPGC_DEAD * sockets in its queue. After this step, all remaining references on * sockets marked UNPGC_DEAD should not be part of any cycle. 
*/ LIST_FOREACH(unp, &unp_deadhead, unp_dead) unp_gc_scan(unp, unp_remove_dead_ref); /* * If a socket still has a non-negative refcount, it cannot be in a * cycle. In this case increment refcount of all children iteratively. * Stop the scan once we do a complete loop without discovering * a new reachable socket. */ do { unp_marked = 0; LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp) if (unp->unp_gcrefs > 0) { unp->unp_gcflag &= ~UNPGC_DEAD; LIST_REMOVE(unp, unp_dead); KASSERT(unp_unreachable > 0, ("%s: unp_unreachable underflow.", __func__)); unp_unreachable--; unp_gc_scan(unp, unp_restore_undead_ref); } } while (unp_marked); UNP_LINK_RUNLOCK(); if (unp_unreachable == 0) return; /* * Allocate space for a local array of dead unpcbs. * TODO: can this path be simplified by instead using the local * dead list at unp_deadhead, after taking out references * on the file object and/or unpcb and dropping the link lock? */ unref = malloc(unp_unreachable * sizeof(struct file *), M_TEMP, M_WAITOK); /* * Iterate looking for sockets which have been specifically marked * as unreachable and store them locally. */ UNP_LINK_RLOCK(); total = 0; LIST_FOREACH(unp, &unp_deadhead, unp_dead) { KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0, ("%s: unp %p not marked UNPGC_DEAD", __func__, unp)); unp->unp_gcflag &= ~UNPGC_DEAD; f = unp->unp_file; if (unp->unp_msgcount == 0 || f == NULL || refcount_load(&f->f_count) != unp->unp_msgcount || !fhold(f)) continue; unref[total++] = f; KASSERT(total <= unp_unreachable, ("%s: incorrect unreachable count.", __func__)); } UNP_LINK_RUNLOCK(); /* * Now flush all sockets, free'ing rights. This will free the * struct files associated with these sockets but leave each socket * with one remaining ref. */ for (i = 0; i < total; i++) { struct socket *so; so = unref[i]->f_data; CURVNET_SET(so->so_vnet); sorflush(so); CURVNET_RESTORE(); } /* * And finally release the sockets so they can be reclaimed. 
*/ for (i = 0; i < total; i++) fdrop(unref[i], NULL); unp_recycled += total; free(unref, M_TEMP); } static void unp_dispose_mbuf(struct mbuf *m) { if (m) unp_scan(m, unp_freerights); } /* * Synchronize against unp_gc, which can trip over data as we are freeing it. */ static void unp_dispose(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); UNP_LINK_WLOCK(); unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS; UNP_LINK_WUNLOCK(); if (!SOLISTENING(so)) unp_dispose_mbuf(so->so_rcv.sb_mb); } static void unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int)) { struct mbuf *m; struct cmsghdr *cm; void *data; socklen_t clen, datalen; while (m0 != NULL) { for (m = m0; m; m = m->m_next) { if (m->m_type != MT_CONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) break; data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { (*op)(data, datalen / sizeof(struct filedescent *)); } if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } } m0 = m0->m_nextpkt; } } /* * A helper function called by VFS before socket-type vnode reclamation. * For an active vnode it clears unp_vnode pointer and decrements unp_vnode * use count. 
*/ void vfs_unp_reclaim(struct vnode *vp) { struct unpcb *unp; int active; struct mtx *vplock; ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim"); KASSERT(vp->v_type == VSOCK, ("vfs_unp_reclaim: vp->v_type != VSOCK")); active = 0; vplock = mtx_pool_find(mtxpool_sleep, vp); mtx_lock(vplock); VOP_UNP_CONNECT(vp, &unp); if (unp == NULL) goto done; UNP_PCB_LOCK(unp); if (unp->unp_vnode == vp) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; active = 1; } UNP_PCB_UNLOCK(unp); done: mtx_unlock(vplock); if (active) vunref(vp); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_unpflags(int unp_flags) { int comma; comma = 0; if (unp_flags & UNP_HAVEPC) { db_printf("%sUNP_HAVEPC", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_WANTCRED_ALWAYS) { db_printf("%sUNP_WANTCRED_ALWAYS", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_WANTCRED_ONESHOT) { db_printf("%sUNP_WANTCRED_ONESHOT", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_CONNWAIT) { db_printf("%sUNP_CONNWAIT", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_CONNECTING) { db_printf("%sUNP_CONNECTING", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_BINDING) { db_printf("%sUNP_BINDING", comma ? ", " : ""); comma = 1; } } static void db_print_xucred(int indent, struct xucred *xu) { int comma, i; db_print_indent(indent); db_printf("cr_version: %u cr_uid: %u cr_pid: %d cr_ngroups: %d\n", xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups); db_print_indent(indent); db_printf("cr_groups: "); comma = 0; for (i = 0; i < xu->cr_ngroups; i++) { db_printf("%s%u", comma ? 
", " : "", xu->cr_groups[i]); comma = 1; } db_printf("\n"); } static void db_print_unprefs(int indent, struct unp_head *uh) { struct unpcb *unp; int counter; counter = 0; LIST_FOREACH(unp, uh, unp_reflink) { if (counter % 4 == 0) db_print_indent(indent); db_printf("%p ", unp); if (counter % 4 == 3) db_printf("\n"); counter++; } if (counter != 0 && counter % 4 != 0) db_printf("\n"); } DB_SHOW_COMMAND(unpcb, db_show_unpcb) { struct unpcb *unp; if (!have_addr) { db_printf("usage: show unpcb \n"); return; } unp = (struct unpcb *)addr; db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket, unp->unp_vnode); db_printf("unp_ino: %ju unp_conn: %p\n", (uintmax_t)unp->unp_ino, unp->unp_conn); db_printf("unp_refs:\n"); db_print_unprefs(2, &unp->unp_refs); /* XXXRW: Would be nice to print the full address, if any. */ db_printf("unp_addr: %p\n", unp->unp_addr); db_printf("unp_gencnt: %llu\n", (unsigned long long)unp->unp_gencnt); db_printf("unp_flags: %x (", unp->unp_flags); db_print_unpflags(unp->unp_flags); db_printf(")\n"); db_printf("unp_peercred:\n"); db_print_xucred(2, &unp->unp_peercred); db_printf("unp_refcount: %u\n", unp->unp_refcount); } #endif diff --git a/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c b/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c index 2bd15ac35895..cd620fe3aef9 100644 --- a/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c +++ b/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c @@ -1,2965 +1,2964 @@ /* * ng_btsocket_l2cap.c */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001-2002 Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: ng_btsocket_l2cap.c,v 1.16 2003/09/14 23:29:06 max Exp $ * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* MALLOC define */ #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_BTSOCKET_L2CAP, "netgraph_btsocks_l2cap", "Netgraph Bluetooth L2CAP sockets"); #else #define M_NETGRAPH_BTSOCKET_L2CAP M_NETGRAPH #endif /* NG_SEPARATE_MALLOC */ /* Netgraph node methods */ static ng_constructor_t ng_btsocket_l2cap_node_constructor; static ng_rcvmsg_t ng_btsocket_l2cap_node_rcvmsg; static ng_shutdown_t ng_btsocket_l2cap_node_shutdown; static ng_newhook_t ng_btsocket_l2cap_node_newhook; static ng_connect_t ng_btsocket_l2cap_node_connect; static ng_rcvdata_t ng_btsocket_l2cap_node_rcvdata; static ng_disconnect_t ng_btsocket_l2cap_node_disconnect; static void ng_btsocket_l2cap_input (void *, int); static 
void ng_btsocket_l2cap_rtclean (void *, int); /* Netgraph type descriptor */ static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_BTSOCKET_L2CAP_NODE_TYPE, .constructor = ng_btsocket_l2cap_node_constructor, .rcvmsg = ng_btsocket_l2cap_node_rcvmsg, .shutdown = ng_btsocket_l2cap_node_shutdown, .newhook = ng_btsocket_l2cap_node_newhook, .connect = ng_btsocket_l2cap_node_connect, .rcvdata = ng_btsocket_l2cap_node_rcvdata, .disconnect = ng_btsocket_l2cap_node_disconnect, }; /* Globals */ extern int ifqmaxlen; static u_int32_t ng_btsocket_l2cap_debug_level; static node_p ng_btsocket_l2cap_node; static struct ng_bt_itemq ng_btsocket_l2cap_queue; static struct mtx ng_btsocket_l2cap_queue_mtx; static struct task ng_btsocket_l2cap_queue_task; static LIST_HEAD(, ng_btsocket_l2cap_pcb) ng_btsocket_l2cap_sockets; static struct mtx ng_btsocket_l2cap_sockets_mtx; static LIST_HEAD(, ng_btsocket_l2cap_rtentry) ng_btsocket_l2cap_rt; static struct mtx ng_btsocket_l2cap_rt_mtx; static struct task ng_btsocket_l2cap_rt_task; static struct timeval ng_btsocket_l2cap_lasttime; static int ng_btsocket_l2cap_curpps; /* Sysctl tree */ SYSCTL_DECL(_net_bluetooth_l2cap_sockets); static SYSCTL_NODE(_net_bluetooth_l2cap_sockets, OID_AUTO, seq, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Bluetooth SEQPACKET L2CAP sockets family"); SYSCTL_UINT(_net_bluetooth_l2cap_sockets_seq, OID_AUTO, debug_level, CTLFLAG_RW, &ng_btsocket_l2cap_debug_level, NG_BTSOCKET_WARN_LEVEL, "Bluetooth SEQPACKET L2CAP sockets debug level"); SYSCTL_UINT(_net_bluetooth_l2cap_sockets_seq, OID_AUTO, queue_len, CTLFLAG_RD, &ng_btsocket_l2cap_queue.len, 0, "Bluetooth SEQPACKET L2CAP sockets input queue length"); SYSCTL_UINT(_net_bluetooth_l2cap_sockets_seq, OID_AUTO, queue_maxlen, CTLFLAG_RD, &ng_btsocket_l2cap_queue.maxlen, 0, "Bluetooth SEQPACKET L2CAP sockets input queue max. 
length"); SYSCTL_UINT(_net_bluetooth_l2cap_sockets_seq, OID_AUTO, queue_drops, CTLFLAG_RD, &ng_btsocket_l2cap_queue.drops, 0, "Bluetooth SEQPACKET L2CAP sockets input queue drops"); /* Debug */ #define NG_BTSOCKET_L2CAP_INFO \ if (ng_btsocket_l2cap_debug_level >= NG_BTSOCKET_INFO_LEVEL && \ ppsratecheck(&ng_btsocket_l2cap_lasttime, &ng_btsocket_l2cap_curpps, 1)) \ printf #define NG_BTSOCKET_L2CAP_WARN \ if (ng_btsocket_l2cap_debug_level >= NG_BTSOCKET_WARN_LEVEL && \ ppsratecheck(&ng_btsocket_l2cap_lasttime, &ng_btsocket_l2cap_curpps, 1)) \ printf #define NG_BTSOCKET_L2CAP_ERR \ if (ng_btsocket_l2cap_debug_level >= NG_BTSOCKET_ERR_LEVEL && \ ppsratecheck(&ng_btsocket_l2cap_lasttime, &ng_btsocket_l2cap_curpps, 1)) \ printf #define NG_BTSOCKET_L2CAP_ALERT \ if (ng_btsocket_l2cap_debug_level >= NG_BTSOCKET_ALERT_LEVEL && \ ppsratecheck(&ng_btsocket_l2cap_lasttime, &ng_btsocket_l2cap_curpps, 1)) \ printf /* * Netgraph message processing routines */ static int ng_btsocket_l2cap_process_l2ca_con_req_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_con_rsp_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_con_ind (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_cfg_req_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_cfg_rsp_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_cfg_ind (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_discon_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_discon_ind (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); static int ng_btsocket_l2cap_process_l2ca_write_rsp (struct ng_mesg *, ng_btsocket_l2cap_rtentry_p); /* * Send L2CA_xxx messages to the lower layer */ static int ng_btsocket_l2cap_send_l2ca_con_req (ng_btsocket_l2cap_pcb_p); 
static int ng_btsocket_l2cap_send_l2ca_con_rsp_req (u_int32_t, ng_btsocket_l2cap_rtentry_p, bdaddr_p, int, int, int, int); static int ng_btsocket_l2cap_send_l2ca_cfg_req (ng_btsocket_l2cap_pcb_p); static int ng_btsocket_l2cap_send_l2ca_cfg_rsp (ng_btsocket_l2cap_pcb_p); static int ng_btsocket_l2cap_send_l2ca_discon_req (u_int32_t, ng_btsocket_l2cap_pcb_p); static int ng_btsocket_l2cap_send2 (ng_btsocket_l2cap_pcb_p); /* * Timeout processing routines */ static void ng_btsocket_l2cap_timeout (ng_btsocket_l2cap_pcb_p); static void ng_btsocket_l2cap_untimeout (ng_btsocket_l2cap_pcb_p); static void ng_btsocket_l2cap_process_timeout (void *); /* * Other stuff */ static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_addr(bdaddr_p, int); static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_token(u_int32_t); static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_cid (bdaddr_p, int,int); static int ng_btsocket_l2cap_result2errno(int); static int ng_btsock_l2cap_addrtype_to_linktype(int addrtype); #define ng_btsocket_l2cap_wakeup_input_task() \ taskqueue_enqueue(taskqueue_swi_giant, &ng_btsocket_l2cap_queue_task) #define ng_btsocket_l2cap_wakeup_route_task() \ taskqueue_enqueue(taskqueue_swi_giant, &ng_btsocket_l2cap_rt_task) int ng_btsock_l2cap_addrtype_to_linktype(int addrtype) { switch(addrtype){ case BDADDR_LE_PUBLIC: return NG_HCI_LINK_LE_PUBLIC; case BDADDR_LE_RANDOM: return NG_HCI_LINK_LE_RANDOM; default: return NG_HCI_LINK_ACL; } } /***************************************************************************** ***************************************************************************** ** Netgraph node interface ***************************************************************************** *****************************************************************************/ /* * Netgraph node constructor. Do not allow to create node of this type. 
*/ static int ng_btsocket_l2cap_node_constructor(node_p node) { return (EINVAL); } /* ng_btsocket_l2cap_node_constructor */ /* * Do local shutdown processing. Let old node go and create new fresh one. */ static int ng_btsocket_l2cap_node_shutdown(node_p node) { int error = 0; NG_NODE_UNREF(node); /* Create new node */ error = ng_make_node_common(&typestruct, &ng_btsocket_l2cap_node); if (error != 0) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not create Netgraph node, error=%d\n", __func__, error); ng_btsocket_l2cap_node = NULL; return (error); } error = ng_name_node(ng_btsocket_l2cap_node, NG_BTSOCKET_L2CAP_NODE_TYPE); if (error != 0) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not name Netgraph node, error=%d\n", __func__, error); NG_NODE_UNREF(ng_btsocket_l2cap_node); ng_btsocket_l2cap_node = NULL; return (error); } return (0); } /* ng_btsocket_l2cap_node_shutdown */ /* * We allow any hook to be connected to the node. */ static int ng_btsocket_l2cap_node_newhook(node_p node, hook_p hook, char const *name) { return (0); } /* ng_btsocket_l2cap_node_newhook */ /* * Just say "YEP, that's OK by me!" */ static int ng_btsocket_l2cap_node_connect(hook_p hook) { NG_HOOK_SET_PRIVATE(hook, NULL); NG_HOOK_REF(hook); /* Keep extra reference to the hook */ #if 0 NG_HOOK_FORCE_QUEUE(NG_HOOK_PEER(hook)); NG_HOOK_FORCE_QUEUE(hook); #endif return (0); } /* ng_btsocket_l2cap_node_connect */ /* * Hook disconnection. Schedule route cleanup task */ static int ng_btsocket_l2cap_node_disconnect(hook_p hook) { /* * If hook has private information than we must have this hook in * the routing table and must schedule cleaning for the routing table. * Otherwise hook was connected but we never got "hook_info" message, * so we have never added this hook to the routing table and it save * to just delete it. 
*/ if (NG_HOOK_PRIVATE(hook) != NULL) return (ng_btsocket_l2cap_wakeup_route_task()); NG_HOOK_UNREF(hook); /* Remove extra reference */ return (0); } /* ng_btsocket_l2cap_node_disconnect */ /* * Process incoming messages */ static int ng_btsocket_l2cap_node_rcvmsg(node_p node, item_p item, hook_p hook) { struct ng_mesg *msg = NGI_MSG(item); /* item still has message */ int error = 0; if (msg != NULL && msg->header.typecookie == NGM_L2CAP_COOKIE) { mtx_lock(&ng_btsocket_l2cap_queue_mtx); if (NG_BT_ITEMQ_FULL(&ng_btsocket_l2cap_queue)) { NG_BTSOCKET_L2CAP_ERR( "%s: Input queue is full (msg)\n", __func__); NG_BT_ITEMQ_DROP(&ng_btsocket_l2cap_queue); NG_FREE_ITEM(item); error = ENOBUFS; } else { if (hook != NULL) { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); } NG_BT_ITEMQ_ENQUEUE(&ng_btsocket_l2cap_queue, item); error = ng_btsocket_l2cap_wakeup_input_task(); } mtx_unlock(&ng_btsocket_l2cap_queue_mtx); } else { NG_FREE_ITEM(item); error = EINVAL; } return (error); } /* ng_btsocket_l2cap_node_rcvmsg */ /* * Receive data on a hook */ static int ng_btsocket_l2cap_node_rcvdata(hook_p hook, item_p item) { int error = 0; mtx_lock(&ng_btsocket_l2cap_queue_mtx); if (NG_BT_ITEMQ_FULL(&ng_btsocket_l2cap_queue)) { NG_BTSOCKET_L2CAP_ERR( "%s: Input queue is full (data)\n", __func__); NG_BT_ITEMQ_DROP(&ng_btsocket_l2cap_queue); NG_FREE_ITEM(item); error = ENOBUFS; } else { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); NG_BT_ITEMQ_ENQUEUE(&ng_btsocket_l2cap_queue, item); error = ng_btsocket_l2cap_wakeup_input_task(); } mtx_unlock(&ng_btsocket_l2cap_queue_mtx); return (error); } /* ng_btsocket_l2cap_node_rcvdata */ /* * Process L2CA_Connect respose. Socket layer must have initiated connection, * so we have to have a socket associated with message token. 
*/ static int ng_btsocket_l2cap_process_l2ca_con_req_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_con_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; int error = 0; if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_con_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Look for the socket with the token */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Connect response, token=%d, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, status=%d, " \ "state=%d\n", __func__, msg->header.token, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, op->lcid, op->result, op->status, pcb->state); if (pcb->state != NG_BTSOCKET_L2CAP_CONNECTING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } ng_btsocket_l2cap_untimeout(pcb); if (op->result == NG_L2CAP_PENDING) { ng_btsocket_l2cap_timeout(pcb); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } if (op->result == NG_L2CAP_SUCCESS){ if((pcb->idtype == NG_L2CAP_L2CA_IDTYPE_ATT)|| (pcb->idtype == NG_L2CAP_L2CA_IDTYPE_SMP)){ pcb->encryption = op->encryption; pcb->cid = op->lcid; if(pcb->need_encrypt && !(pcb->encryption)){ ng_btsocket_l2cap_timeout(pcb); pcb->state = NG_BTSOCKET_L2CAP_W4_ENC_CHANGE; }else{ pcb->state = NG_BTSOCKET_L2CAP_OPEN; soisconnected(pcb->so); } }else{ /* * Channel is now open, so update local channel ID and * start configuration process. Source and destination * addresses as well as route must be already set. 
*/ pcb->cid = op->lcid; pcb->encryption = op->encryption; error = ng_btsocket_l2cap_send_l2ca_cfg_req(pcb); if (error != 0) { /* Send disconnect request with "zero" token */ ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); /* ... and close the socket */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } else { pcb->cfg_state = NG_BTSOCKET_L2CAP_CFG_IN_SENT; pcb->state = NG_BTSOCKET_L2CAP_CONFIGURING; ng_btsocket_l2cap_timeout(pcb); } } } else { /* * We have failed to open connection, so convert result * code to "errno" code and disconnect the socket. Channel * already has been closed. */ pcb->so->so_error = ng_btsocket_l2cap_result2errno(op->result); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); } /* ng_btsocket_l2cap_process_l2ca_con_req_rsp */ /* * Process L2CA_ConnectRsp response */ static int ng_btsocket_l2cap_process_l2ca_con_rsp_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_con_rsp_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_con_rsp_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Look for the socket with the token */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_ConnectRsp response, token=%d, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, state=%d\n", __func__, msg->header.token, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, op->result, pcb->state); if (pcb->state != NG_BTSOCKET_L2CAP_CONNECTING) { mtx_unlock(&pcb->pcb_mtx); 
mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } ng_btsocket_l2cap_untimeout(pcb); /* Check the result and disconnect the socket on failure */ if (op->result != NG_L2CAP_SUCCESS) { /* Close the socket - channel already closed */ pcb->so->so_error = ng_btsocket_l2cap_result2errno(op->result); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } else { /* Move to CONFIGURING state and wait for CONFIG_IND */ pcb->cfg_state = 0; pcb->state = NG_BTSOCKET_L2CAP_CONFIGURING; ng_btsocket_l2cap_timeout(pcb); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_process_l2ca_con_rsp_rsp */ /* * Process L2CA_Connect indicator. Find socket that listens on address * and PSM. Find exact or closest match. Create new socket and initiate * connection. */ static int ng_btsocket_l2cap_process_l2ca_con_ind(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_con_ind_ip *ip = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL, *pcb1 = NULL; int error = 0; u_int32_t token = 0; u_int16_t result = 0; if (msg->header.arglen != sizeof(*ip)) return (EMSGSIZE); ip = (ng_l2cap_l2ca_con_ind_ip *)(msg->data); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Connect indicator, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, ident=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], ip->bdaddr.b[5], ip->bdaddr.b[4], ip->bdaddr.b[3], ip->bdaddr.b[2], ip->bdaddr.b[1], ip->bdaddr.b[0], ip->psm, ip->lcid, ip->ident); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); pcb = ng_btsocket_l2cap_pcb_by_addr(&rt->src, ip->psm); if (pcb != NULL) { struct socket *so1; mtx_lock(&pcb->pcb_mtx); CURVNET_SET(pcb->so->so_vnet); so1 = sonewconn(pcb->so, 0); CURVNET_RESTORE(); if (so1 == NULL) { result = NG_L2CAP_NO_RESOURCES; goto respond; } /* * If we got here than we have created new socket. So complete * connection. 
If we we listening on specific address then copy * source address from listening socket, otherwise copy source * address from hook's routing information. */ pcb1 = so2l2cap_pcb(so1); KASSERT((pcb1 != NULL), ("%s: pcb1 == NULL\n", __func__)); mtx_lock(&pcb1->pcb_mtx); if (bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(pcb->src)) != 0) bcopy(&pcb->src, &pcb1->src, sizeof(pcb1->src)); else bcopy(&rt->src, &pcb1->src, sizeof(pcb1->src)); pcb1->flags &= ~NG_BTSOCKET_L2CAP_CLIENT; bcopy(&ip->bdaddr, &pcb1->dst, sizeof(pcb1->dst)); pcb1->psm = ip->psm; pcb1->cid = ip->lcid; pcb1->rt = rt; /* Copy socket settings */ pcb1->imtu = pcb->imtu; bcopy(&pcb->oflow, &pcb1->oflow, sizeof(pcb1->oflow)); pcb1->flush_timo = pcb->flush_timo; token = pcb1->token; } else /* Nobody listens on requested BDADDR/PSM */ result = NG_L2CAP_PSM_NOT_SUPPORTED; respond: error = ng_btsocket_l2cap_send_l2ca_con_rsp_req(token, rt, &ip->bdaddr, ip->ident, ip->lcid, result,ip->linktype); if (pcb1 != NULL) { if (error != 0) { pcb1->so->so_error = error; pcb1->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb1->so); } else { pcb1->state = NG_BTSOCKET_L2CAP_CONNECTING; soisconnecting(pcb1->so); ng_btsocket_l2cap_timeout(pcb1); } mtx_unlock(&pcb1->pcb_mtx); } if (pcb != NULL) mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); } /* ng_btsocket_l2cap_process_l2ca_con_ind */ /*Encryption Change*/ static int ng_btsocket_l2cap_process_l2ca_enc_change(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_enc_chg_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_enc_chg_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); pcb = ng_btsocket_l2cap_pcb_by_cid(&rt->src, op->lcid, op->idtype); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); pcb->encryption = op->result; if(pcb->need_encrypt){ 
ng_btsocket_l2cap_untimeout(pcb); if(pcb->state != NG_BTSOCKET_L2CAP_W4_ENC_CHANGE){ NG_BTSOCKET_L2CAP_WARN("%s: Invalid pcb status %d", __func__, pcb->state); }else if(pcb->encryption){ pcb->state = NG_BTSOCKET_L2CAP_OPEN; soisconnected(pcb->so); }else{ pcb->so->so_error = EPERM; ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return 0; } /* * Process L2CA_Config response */ static int ng_btsocket_l2cap_process_l2ca_cfg_req_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_cfg_op *op = NULL; ng_btsocket_l2cap_pcb_p pcb = NULL; if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_cfg_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* * Socket must have issued a Configure request, so we must have a * socket that wants to be configured. Use Netgraph message token * to find it */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { /* * XXX FIXME what to do here? We could not find a * socket with requested token. We even can not send * Disconnect, because we do not know channel ID */ mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Config response, token=%d, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, state=%d, " \ "cfg_state=%x\n", __func__, msg->header.token, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, op->result, pcb->state, pcb->cfg_state); if (pcb->state != NG_BTSOCKET_L2CAP_CONFIGURING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } if (op->result == NG_L2CAP_SUCCESS) { /* * XXX FIXME Actually set flush and link timeout. 
* Set QoS here if required. Resolve conficts (flush_timo). * Save incoming MTU (peer's outgoing MTU) and outgoing flow * spec. */ pcb->imtu = op->imtu; bcopy(&op->oflow, &pcb->oflow, sizeof(pcb->oflow)); pcb->flush_timo = op->flush_timo; /* * We have configured incoming side, so record it and check * if configuration is complete. If complete then mark socket * as connected, otherwise wait for the peer. */ pcb->cfg_state &= ~NG_BTSOCKET_L2CAP_CFG_IN_SENT; pcb->cfg_state |= NG_BTSOCKET_L2CAP_CFG_IN; if (pcb->cfg_state == NG_BTSOCKET_L2CAP_CFG_BOTH) { /* Configuration complete - mark socket as open */ ng_btsocket_l2cap_untimeout(pcb); pcb->state = NG_BTSOCKET_L2CAP_OPEN; soisconnected(pcb->so); } } else { /* * Something went wrong. Could be unacceptable parameters, * reject or unknown option. That's too bad, but we will * not negotiate. Send Disconnect and close the channel. */ ng_btsocket_l2cap_untimeout(pcb); switch (op->result) { case NG_L2CAP_UNACCEPTABLE_PARAMS: case NG_L2CAP_UNKNOWN_OPTION: pcb->so->so_error = EINVAL; break; default: pcb->so->so_error = ECONNRESET; break; } /* Send disconnect with "zero" token */ ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); /* ... 
and close the socket */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_l2cap_process_l2ca_cfg_req_rsp */ /* * Process L2CA_ConfigRsp response */ static int ng_btsocket_l2cap_process_l2ca_cfg_rsp_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_cfg_rsp_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; int error = 0; if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_cfg_rsp_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Look for the socket with the token */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_ConfigRsp response, token=%d, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, state=%d, " \ "cfg_state=%x\n", __func__, msg->header.token, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, op->result, pcb->state, pcb->cfg_state); if (pcb->state != NG_BTSOCKET_L2CAP_CONFIGURING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } /* Check the result and disconnect socket of failure */ if (op->result != NG_L2CAP_SUCCESS) goto disconnect; /* * Now we done with remote side configuration. Configure local * side if we have not done it yet. 
*/ pcb->cfg_state &= ~NG_BTSOCKET_L2CAP_CFG_OUT_SENT; pcb->cfg_state |= NG_BTSOCKET_L2CAP_CFG_OUT; if (pcb->cfg_state == NG_BTSOCKET_L2CAP_CFG_BOTH) { /* Configuration complete - mask socket as open */ ng_btsocket_l2cap_untimeout(pcb); pcb->state = NG_BTSOCKET_L2CAP_OPEN; soisconnected(pcb->so); } else { if (!(pcb->cfg_state & NG_BTSOCKET_L2CAP_CFG_IN_SENT)) { /* Send L2CA_Config request - incoming path */ error = ng_btsocket_l2cap_send_l2ca_cfg_req(pcb); if (error != 0) goto disconnect; pcb->cfg_state |= NG_BTSOCKET_L2CAP_CFG_IN_SENT; } } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); disconnect: ng_btsocket_l2cap_untimeout(pcb); /* Send disconnect with "zero" token */ ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); /* ... and close the socket */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); } /* ng_btsocket_l2cap_process_l2ca_cfg_rsp_rsp */ /* * Process L2CA_Config indicator */ static int ng_btsocket_l2cap_process_l2ca_cfg_ind(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_cfg_ind_ip *ip = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; int error = 0; if (msg->header.arglen != sizeof(*ip)) return (EMSGSIZE); ip = (ng_l2cap_l2ca_cfg_ind_ip *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Check for the open socket that has given channel ID */ pcb = ng_btsocket_l2cap_pcb_by_cid(&rt->src, ip->lcid, NG_L2CAP_L2CA_IDTYPE_BREDR); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Config indicator, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, state=%d, cfg_state=%x\n", __func__, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, 
pcb->cid, pcb->state, pcb->cfg_state); /* XXX FIXME re-configuration on open socket */ if (pcb->state != NG_BTSOCKET_L2CAP_CONFIGURING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } /* * XXX FIXME Actually set flush and link timeout. Set QoS here if * required. Resolve conficts (flush_timo). Note outgoing MTU (peer's * incoming MTU) and incoming flow spec. */ pcb->omtu = ip->omtu; bcopy(&ip->iflow, &pcb->iflow, sizeof(pcb->iflow)); pcb->flush_timo = ip->flush_timo; /* * Send L2CA_Config response to our peer and check for the errors, * if any send disconnect to close the channel. */ if (!(pcb->cfg_state & NG_BTSOCKET_L2CAP_CFG_OUT_SENT)) { error = ng_btsocket_l2cap_send_l2ca_cfg_rsp(pcb); if (error != 0) { ng_btsocket_l2cap_untimeout(pcb); pcb->so->so_error = error; /* Send disconnect with "zero" token */ ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); /* ... and close the socket */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } else pcb->cfg_state |= NG_BTSOCKET_L2CAP_CFG_OUT_SENT; } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); } /* ng_btsocket_l2cap_process_l2cap_cfg_ind */ /* * Process L2CA_Disconnect response */ static int ng_btsocket_l2cap_process_l2ca_discon_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_discon_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; /* Check message */ if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_discon_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* * Socket layer must have issued L2CA_Disconnect request, so there * must be a socket that wants to be disconnected. Use Netgraph * message token to find it. 
*/ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } mtx_lock(&pcb->pcb_mtx); /* XXX Close socket no matter what op->result says */ if (pcb->state != NG_BTSOCKET_L2CAP_CLOSED) { NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Disconnect response, token=%d, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, state=%d\n", __func__, msg->header.token, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, op->result, pcb->state); ng_btsocket_l2cap_untimeout(pcb); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_l2cap_process_l2ca_discon_rsp */ /* * Process L2CA_Disconnect indicator */ static int ng_btsocket_l2cap_process_l2ca_discon_ind(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_discon_ind_ip *ip = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; /* Check message */ if (msg->header.arglen != sizeof(*ip)) return (EMSGSIZE); ip = (ng_l2cap_l2ca_discon_ind_ip *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Look for the socket with given channel ID */ pcb = ng_btsocket_l2cap_pcb_by_cid(&rt->src, ip->lcid, ip->idtype); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* * Channel has already been destroyed, so disconnect the socket * and be done with it. If there was any pending request we can * not do anything here anyway. 
*/ mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Disconnect indicator, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, state=%d\n", __func__, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, pcb->state); if (pcb->flags & NG_BTSOCKET_L2CAP_TIMO) ng_btsocket_l2cap_untimeout(pcb); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_l2cap_process_l2ca_discon_ind */ /* * Process L2CA_Write response */ static int ng_btsocket_l2cap_process_l2ca_write_rsp(struct ng_mesg *msg, ng_btsocket_l2cap_rtentry_p rt) { ng_l2cap_l2ca_write_op *op = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; /* Check message */ if (msg->header.arglen != sizeof(*op)) return (EMSGSIZE); op = (ng_l2cap_l2ca_write_op *)(msg->data); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Look for the socket with given token */ pcb = ng_btsocket_l2cap_pcb_by_token(msg->header.token); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CA_Write response, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, lcid=%d, result=%d, length=%d, " \ "state=%d\n", __func__, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->psm, pcb->cid, op->result, op->length, pcb->state); if (pcb->state != NG_BTSOCKET_L2CAP_OPEN) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (ENOENT); } ng_btsocket_l2cap_untimeout(pcb); /* * Check if we have more data to send */ sbdroprecord(&pcb->so->so_snd); if (sbavail(&pcb->so->so_snd) > 0) { if 
(ng_btsocket_l2cap_send2(pcb) == 0) ng_btsocket_l2cap_timeout(pcb); else sbdroprecord(&pcb->so->so_snd); /* XXX */ } /* * Now set the result, drop packet from the socket send queue and * ask for more (wakeup sender) */ pcb->so->so_error = ng_btsocket_l2cap_result2errno(op->result); sowwakeup(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_l2cap_process_l2ca_write_rsp */ /* * Send L2CA_Connect request */ static int ng_btsocket_l2cap_send_l2ca_con_req(ng_btsocket_l2cap_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_l2cap_l2ca_con_ip *ip = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_L2CAP_COOKIE, NGM_L2CAP_L2CA_CON, sizeof(*ip), M_NOWAIT); if (msg == NULL) return (ENOMEM); msg->header.token = pcb->token; ip = (ng_l2cap_l2ca_con_ip *)(msg->data); bcopy(&pcb->dst, &ip->bdaddr, sizeof(ip->bdaddr)); ip->psm = pcb->psm; ip->linktype = ng_btsock_l2cap_addrtype_to_linktype(pcb->dsttype); ip->idtype = pcb->idtype; NG_SEND_MSG_HOOK(error, ng_btsocket_l2cap_node, msg,pcb->rt->hook, 0); return (error); } /* ng_btsocket_l2cap_send_l2ca_con_req */ /* * Send L2CA_Connect response */ static int ng_btsocket_l2cap_send_l2ca_con_rsp_req(u_int32_t token, ng_btsocket_l2cap_rtentry_p rt, bdaddr_p dst, int ident, int lcid, int result, int linktype) { struct ng_mesg *msg = NULL; ng_l2cap_l2ca_con_rsp_ip *ip = NULL; int error = 0; if (rt == NULL || rt->hook == NULL || NG_HOOK_NOT_VALID(rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_L2CAP_COOKIE, NGM_L2CAP_L2CA_CON_RSP, sizeof(*ip), M_NOWAIT); if (msg == NULL) return (ENOMEM); msg->header.token = token; ip = (ng_l2cap_l2ca_con_rsp_ip *)(msg->data); bcopy(dst, &ip->bdaddr, sizeof(ip->bdaddr)); ip->ident = ident; ip->lcid = lcid; ip->linktype = linktype; ip->result = result; ip->status = 0; NG_SEND_MSG_HOOK(error, ng_btsocket_l2cap_node, msg, 
rt->hook, 0); return (error); } /* ng_btsocket_l2cap_send_l2ca_con_rsp_req */ /* * Send L2CA_Config request */ static int ng_btsocket_l2cap_send_l2ca_cfg_req(ng_btsocket_l2cap_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_l2cap_l2ca_cfg_ip *ip = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_L2CAP_COOKIE, NGM_L2CAP_L2CA_CFG, sizeof(*ip), M_NOWAIT); if (msg == NULL) return (ENOMEM); msg->header.token = pcb->token; ip = (ng_l2cap_l2ca_cfg_ip *)(msg->data); ip->lcid = pcb->cid; ip->imtu = pcb->imtu; bcopy(&pcb->oflow, &ip->oflow, sizeof(ip->oflow)); ip->flush_timo = pcb->flush_timo; ip->link_timo = pcb->link_timo; NG_SEND_MSG_HOOK(error, ng_btsocket_l2cap_node, msg,pcb->rt->hook, 0); return (error); } /* ng_btsocket_l2cap_send_l2ca_cfg_req */ /* * Send L2CA_Config response */ static int ng_btsocket_l2cap_send_l2ca_cfg_rsp(ng_btsocket_l2cap_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_l2cap_l2ca_cfg_rsp_ip *ip = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_L2CAP_COOKIE, NGM_L2CAP_L2CA_CFG_RSP, sizeof(*ip), M_NOWAIT); if (msg == NULL) return (ENOMEM); msg->header.token = pcb->token; ip = (ng_l2cap_l2ca_cfg_rsp_ip *)(msg->data); ip->lcid = pcb->cid; ip->omtu = pcb->omtu; bcopy(&pcb->iflow, &ip->iflow, sizeof(ip->iflow)); NG_SEND_MSG_HOOK(error, ng_btsocket_l2cap_node, msg, pcb->rt->hook, 0); return (error); } /* ng_btsocket_l2cap_send_l2ca_cfg_rsp */ /* * Send L2CA_Disconnect request */ static int ng_btsocket_l2cap_send_l2ca_discon_req(u_int32_t token, ng_btsocket_l2cap_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_l2cap_l2ca_discon_ip *ip = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); 
NG_MKMESSAGE(msg, NGM_L2CAP_COOKIE, NGM_L2CAP_L2CA_DISCON, sizeof(*ip), M_NOWAIT); if (msg == NULL) return (ENOMEM); msg->header.token = token; ip = (ng_l2cap_l2ca_discon_ip *)(msg->data); ip->lcid = pcb->cid; ip->idtype = pcb->idtype; NG_SEND_MSG_HOOK(error, ng_btsocket_l2cap_node, msg,pcb->rt->hook, 0); return (error); } /* ng_btsocket_l2cap_send_l2ca_discon_req */ /***************************************************************************** ***************************************************************************** ** Socket interface ***************************************************************************** *****************************************************************************/ /* * L2CAP sockets data input routine */ static void ng_btsocket_l2cap_data_input(struct mbuf *m, hook_p hook) { ng_l2cap_hdr_t *hdr = NULL; ng_l2cap_clt_hdr_t *clt_hdr = NULL; ng_btsocket_l2cap_pcb_t *pcb = NULL; ng_btsocket_l2cap_rtentry_t *rt = NULL; uint16_t idtype; if (hook == NULL) { NG_BTSOCKET_L2CAP_ALERT( "%s: Invalid source hook for L2CAP data packet\n", __func__); goto drop; } rt = (ng_btsocket_l2cap_rtentry_t *) NG_HOOK_PRIVATE(hook); if (rt == NULL) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not find out source bdaddr for L2CAP data packet\n", __func__); goto drop; } m = m_pullup(m, sizeof(uint16_t)); idtype = *mtod(m, uint16_t *); m_adj(m, sizeof(uint16_t)); /* Make sure we can access header */ if (m->m_pkthdr.len < sizeof(*hdr)) { NG_BTSOCKET_L2CAP_ERR( "%s: L2CAP data packet too small, len=%d\n", __func__, m->m_pkthdr.len); goto drop; } if (m->m_len < sizeof(*hdr)) { m = m_pullup(m, sizeof(*hdr)); if (m == NULL) goto drop; } /* Strip L2CAP packet header and verify packet length */ hdr = mtod(m, ng_l2cap_hdr_t *); m_adj(m, sizeof(*hdr)); if (hdr->length != m->m_pkthdr.len) { NG_BTSOCKET_L2CAP_ERR( "%s: Bad L2CAP data packet length, len=%d, length=%d\n", __func__, m->m_pkthdr.len, hdr->length); goto drop; } /* * Now process packet. 
Two cases: * * 1) Normal packet (cid != 2) then find connected socket and append * mbuf to the socket queue. Wakeup socket. * * 2) Broadcast packet (cid == 2) then find all sockets that connected * to the given PSM and have SO_BROADCAST bit set and append mbuf * to the socket queue. Wakeup socket. */ NG_BTSOCKET_L2CAP_INFO( "%s: Received L2CAP data packet: src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dcid=%d, length=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->dcid, hdr->length); if ((hdr->dcid >= NG_L2CAP_FIRST_CID) || (idtype == NG_L2CAP_L2CA_IDTYPE_ATT)|| (idtype == NG_L2CAP_L2CA_IDTYPE_SMP) ){ mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* Normal packet: find connected socket */ pcb = ng_btsocket_l2cap_pcb_by_cid(&rt->src, hdr->dcid,idtype); if (pcb == NULL) { mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); goto drop; } mtx_lock(&pcb->pcb_mtx); if (pcb->state != NG_BTSOCKET_L2CAP_OPEN) { NG_BTSOCKET_L2CAP_ERR( "%s: No connected socket found, src bdaddr=%x:%x:%x:%x:%x:%x, dcid=%d, " \ "state=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->dcid, pcb->state); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); goto drop; } /* Check packet size against socket's incoming MTU */ if (hdr->length > pcb->imtu) { NG_BTSOCKET_L2CAP_ERR( "%s: L2CAP data packet too big, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dcid=%d, length=%d, imtu=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->dcid, hdr->length, pcb->imtu); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); goto drop; } /* Check if we have enough space in socket receive queue */ if (m->m_pkthdr.len > sbspace(&pcb->so->so_rcv)) { /* * This is really bad. Receive queue on socket does * not have enough space for the packet. We do not * have any other choice but drop the packet. L2CAP * does not provide any flow control. 
*/ NG_BTSOCKET_L2CAP_ERR( "%s: Not enough space in socket receive queue. Dropping L2CAP data packet, " \ "src bdaddr=%x:%x:%x:%x:%x:%x, dcid=%d, len=%d, space=%ld\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->dcid, m->m_pkthdr.len, sbspace(&pcb->so->so_rcv)); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); goto drop; } /* Append packet to the socket receive queue and wakeup */ sbappendrecord(&pcb->so->so_rcv, m); m = NULL; sorwakeup(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); } else if (hdr->dcid == NG_L2CAP_CLT_CID) { /* Broadcast packet: give packet to all sockets */ /* Check packet size against connectionless MTU */ if (hdr->length > NG_L2CAP_MTU_DEFAULT) { NG_BTSOCKET_L2CAP_ERR( "%s: Connectionless L2CAP data packet too big, " \ "src bdaddr=%x:%x:%x:%x:%x:%x, length=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->length); goto drop; } /* Make sure we can access connectionless header */ if (m->m_pkthdr.len < sizeof(*clt_hdr)) { NG_BTSOCKET_L2CAP_ERR( "%s: Can not get L2CAP connectionless packet header, " \ "src bdaddr=%x:%x:%x:%x:%x:%x, length=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], hdr->length); goto drop; } if (m->m_len < sizeof(*clt_hdr)) { m = m_pullup(m, sizeof(*clt_hdr)); if (m == NULL) goto drop; } /* Strip connectionless header and deliver packet */ clt_hdr = mtod(m, ng_l2cap_clt_hdr_t *); m_adj(m, sizeof(*clt_hdr)); NG_BTSOCKET_L2CAP_INFO( "%s: Got L2CAP connectionless data packet, " \ "src bdaddr=%x:%x:%x:%x:%x:%x, psm=%d, length=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], clt_hdr->psm, hdr->length); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); LIST_FOREACH(pcb, &ng_btsocket_l2cap_sockets, next) { struct mbuf *copy = NULL; mtx_lock(&pcb->pcb_mtx); if (bcmp(&rt->src, 
&pcb->src, sizeof(pcb->src)) != 0 || pcb->psm != clt_hdr->psm || pcb->state != NG_BTSOCKET_L2CAP_OPEN || (pcb->so->so_options & SO_BROADCAST) == 0 || m->m_pkthdr.len > sbspace(&pcb->so->so_rcv)) goto next; /* * Create a copy of the packet and append it to the * socket's queue. If m_dup() failed - no big deal * it is a broadcast traffic after all */ copy = m_dup(m, M_NOWAIT); if (copy != NULL) { sbappendrecord(&pcb->so->so_rcv, copy); sorwakeup(pcb->so); } next: mtx_unlock(&pcb->pcb_mtx); } mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); } drop: NG_FREE_M(m); /* checks for m != NULL */ } /* ng_btsocket_l2cap_data_input */ /* * L2CAP sockets default message input routine */ static void ng_btsocket_l2cap_default_msg_input(struct ng_mesg *msg, hook_p hook) { switch (msg->header.cmd) { case NGM_L2CAP_NODE_HOOK_INFO: { ng_btsocket_l2cap_rtentry_t *rt = NULL; ng_l2cap_node_hook_info_ep *ep = (ng_l2cap_node_hook_info_ep *)msg->data; if (hook == NULL || msg->header.arglen != sizeof(*ep)) break; if (bcmp(&ep->addr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) break; mtx_lock(&ng_btsocket_l2cap_rt_mtx); rt = (ng_btsocket_l2cap_rtentry_t *) NG_HOOK_PRIVATE(hook); if (rt == NULL) { rt = malloc(sizeof(*rt), M_NETGRAPH_BTSOCKET_L2CAP, M_NOWAIT|M_ZERO); if (rt == NULL) { mtx_unlock(&ng_btsocket_l2cap_rt_mtx); break; } LIST_INSERT_HEAD(&ng_btsocket_l2cap_rt, rt, next); NG_HOOK_SET_PRIVATE(hook, rt); } bcopy(&ep->addr, &rt->src, sizeof(rt->src)); rt->hook = hook; mtx_unlock(&ng_btsocket_l2cap_rt_mtx); NG_BTSOCKET_L2CAP_INFO( "%s: Updating hook \"%s\", src bdaddr=%x:%x:%x:%x:%x:%x\n", __func__, NG_HOOK_NAME(hook), rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0]); } break; default: NG_BTSOCKET_L2CAP_WARN( "%s: Unknown message, cmd=%d\n", __func__, msg->header.cmd); break; } NG_FREE_MSG(msg); /* Checks for msg != NULL */ } /* ng_btsocket_l2cap_default_msg_input */ /* * L2CAP sockets L2CA message input routine */ static void 
ng_btsocket_l2cap_l2ca_msg_input(struct ng_mesg *msg, hook_p hook) { ng_btsocket_l2cap_rtentry_p rt = NULL; if (hook == NULL) { NG_BTSOCKET_L2CAP_ALERT( "%s: Invalid source hook for L2CA message\n", __func__); goto drop; } rt = (ng_btsocket_l2cap_rtentry_p) NG_HOOK_PRIVATE(hook); if (rt == NULL) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not find out source bdaddr for L2CA message\n", __func__); goto drop; } switch (msg->header.cmd) { case NGM_L2CAP_L2CA_CON: /* L2CA_Connect response */ ng_btsocket_l2cap_process_l2ca_con_req_rsp(msg, rt); break; case NGM_L2CAP_L2CA_CON_RSP: /* L2CA_ConnectRsp response */ ng_btsocket_l2cap_process_l2ca_con_rsp_rsp(msg, rt); break; case NGM_L2CAP_L2CA_CON_IND: /* L2CA_Connect indicator */ ng_btsocket_l2cap_process_l2ca_con_ind(msg, rt); break; case NGM_L2CAP_L2CA_CFG: /* L2CA_Config response */ ng_btsocket_l2cap_process_l2ca_cfg_req_rsp(msg, rt); break; case NGM_L2CAP_L2CA_CFG_RSP: /* L2CA_ConfigRsp response */ ng_btsocket_l2cap_process_l2ca_cfg_rsp_rsp(msg, rt); break; case NGM_L2CAP_L2CA_CFG_IND: /* L2CA_Config indicator */ ng_btsocket_l2cap_process_l2ca_cfg_ind(msg, rt); break; case NGM_L2CAP_L2CA_DISCON: /* L2CA_Disconnect response */ ng_btsocket_l2cap_process_l2ca_discon_rsp(msg, rt); break; case NGM_L2CAP_L2CA_DISCON_IND: /* L2CA_Disconnect indicator */ ng_btsocket_l2cap_process_l2ca_discon_ind(msg, rt); break; case NGM_L2CAP_L2CA_WRITE: /* L2CA_Write response */ ng_btsocket_l2cap_process_l2ca_write_rsp(msg, rt); break; case NGM_L2CAP_L2CA_ENC_CHANGE: ng_btsocket_l2cap_process_l2ca_enc_change(msg, rt); break; /* XXX FIXME add other L2CA messages */ default: NG_BTSOCKET_L2CAP_WARN( "%s: Unknown L2CA message, cmd=%d\n", __func__, msg->header.cmd); break; } drop: NG_FREE_MSG(msg); } /* ng_btsocket_l2cap_l2ca_msg_input */ /* * L2CAP sockets input routine */ static void ng_btsocket_l2cap_input(void *context, int pending) { item_p item = NULL; hook_p hook = NULL; for (;;) { mtx_lock(&ng_btsocket_l2cap_queue_mtx); 
NG_BT_ITEMQ_DEQUEUE(&ng_btsocket_l2cap_queue, item); mtx_unlock(&ng_btsocket_l2cap_queue_mtx); if (item == NULL) break; NGI_GET_HOOK(item, hook); if (hook != NULL && NG_HOOK_NOT_VALID(hook)) goto drop; switch(item->el_flags & NGQF_TYPE) { case NGQF_DATA: { struct mbuf *m = NULL; NGI_GET_M(item, m); ng_btsocket_l2cap_data_input(m, hook); } break; case NGQF_MESG: { struct ng_mesg *msg = NULL; NGI_GET_MSG(item, msg); switch (msg->header.cmd) { case NGM_L2CAP_L2CA_CON: case NGM_L2CAP_L2CA_CON_RSP: case NGM_L2CAP_L2CA_CON_IND: case NGM_L2CAP_L2CA_CFG: case NGM_L2CAP_L2CA_CFG_RSP: case NGM_L2CAP_L2CA_CFG_IND: case NGM_L2CAP_L2CA_DISCON: case NGM_L2CAP_L2CA_DISCON_IND: case NGM_L2CAP_L2CA_WRITE: case NGM_L2CAP_L2CA_ENC_CHANGE: /* XXX FIXME add other L2CA messages */ ng_btsocket_l2cap_l2ca_msg_input(msg, hook); break; default: ng_btsocket_l2cap_default_msg_input(msg, hook); break; } } break; default: KASSERT(0, ("%s: invalid item type=%ld\n", __func__, (item->el_flags & NGQF_TYPE))); break; } drop: if (hook != NULL) NG_HOOK_UNREF(hook); NG_FREE_ITEM(item); } } /* ng_btsocket_l2cap_input */ /* * Route cleanup task. Gets scheduled when hook is disconnected. Here we * will find all sockets that use "invalid" hook and disconnect them. 
*/ static void ng_btsocket_l2cap_rtclean(void *context, int pending) { ng_btsocket_l2cap_pcb_p pcb = NULL, pcb_next = NULL; ng_btsocket_l2cap_rtentry_p rt = NULL; mtx_lock(&ng_btsocket_l2cap_rt_mtx); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); /* * First disconnect all sockets that use "invalid" hook */ for (pcb = LIST_FIRST(&ng_btsocket_l2cap_sockets); pcb != NULL; ) { mtx_lock(&pcb->pcb_mtx); pcb_next = LIST_NEXT(pcb, next); if (pcb->rt != NULL && pcb->rt->hook != NULL && NG_HOOK_NOT_VALID(pcb->rt->hook)) { if (pcb->flags & NG_BTSOCKET_L2CAP_TIMO) ng_btsocket_l2cap_untimeout(pcb); pcb->so->so_error = ENETDOWN; pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); pcb->token = 0; pcb->cid = 0; pcb->rt = NULL; } mtx_unlock(&pcb->pcb_mtx); pcb = pcb_next; } /* * Now cleanup routing table */ for (rt = LIST_FIRST(&ng_btsocket_l2cap_rt); rt != NULL; ) { ng_btsocket_l2cap_rtentry_p rt_next = LIST_NEXT(rt, next); if (rt->hook != NULL && NG_HOOK_NOT_VALID(rt->hook)) { LIST_REMOVE(rt, next); NG_HOOK_SET_PRIVATE(rt->hook, NULL); NG_HOOK_UNREF(rt->hook); /* Remove extra reference */ bzero(rt, sizeof(*rt)); free(rt, M_NETGRAPH_BTSOCKET_L2CAP); } rt = rt_next; } mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); mtx_unlock(&ng_btsocket_l2cap_rt_mtx); } /* ng_btsocket_l2cap_rtclean */ /* * Initialize everything */ void ng_btsocket_l2cap_init(void) { int error = 0; /* Skip initialization of globals for non-default instances. 
*/ if (!IS_DEFAULT_VNET(curvnet)) return; ng_btsocket_l2cap_node = NULL; ng_btsocket_l2cap_debug_level = NG_BTSOCKET_WARN_LEVEL; /* Register Netgraph node type */ error = ng_newtype(&typestruct); if (error != 0) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not register Netgraph node type, error=%d\n", __func__, error); return; } /* Create Netgrapg node */ error = ng_make_node_common(&typestruct, &ng_btsocket_l2cap_node); if (error != 0) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not create Netgraph node, error=%d\n", __func__, error); ng_btsocket_l2cap_node = NULL; return; } error = ng_name_node(ng_btsocket_l2cap_node, NG_BTSOCKET_L2CAP_NODE_TYPE); if (error != 0) { NG_BTSOCKET_L2CAP_ALERT( "%s: Could not name Netgraph node, error=%d\n", __func__, error); NG_NODE_UNREF(ng_btsocket_l2cap_node); ng_btsocket_l2cap_node = NULL; return; } /* Create input queue */ NG_BT_ITEMQ_INIT(&ng_btsocket_l2cap_queue, ifqmaxlen); mtx_init(&ng_btsocket_l2cap_queue_mtx, "btsocks_l2cap_queue_mtx", NULL, MTX_DEF); TASK_INIT(&ng_btsocket_l2cap_queue_task, 0, ng_btsocket_l2cap_input, NULL); /* Create list of sockets */ LIST_INIT(&ng_btsocket_l2cap_sockets); mtx_init(&ng_btsocket_l2cap_sockets_mtx, "btsocks_l2cap_sockets_mtx", NULL, MTX_DEF); /* Routing table */ LIST_INIT(&ng_btsocket_l2cap_rt); mtx_init(&ng_btsocket_l2cap_rt_mtx, "btsocks_l2cap_rt_mtx", NULL, MTX_DEF); TASK_INIT(&ng_btsocket_l2cap_rt_task, 0, ng_btsocket_l2cap_rtclean, NULL); } /* ng_btsocket_l2cap_init */ /* * Abort connection on socket */ void ng_btsocket_l2cap_abort(struct socket *so) { so->so_error = ECONNABORTED; (void)ng_btsocket_l2cap_disconnect(so); } /* ng_btsocket_l2cap_abort */ void ng_btsocket_l2cap_close(struct socket *so) { (void)ng_btsocket_l2cap_disconnect(so); } /* ng_btsocket_l2cap_close */ /* * Accept connection on socket. Nothing to do here, socket must be connected * and ready, so just return peer address and be done with it. 
*/ int ng_btsocket_l2cap_accept(struct socket *so, struct sockaddr **nam) { if (ng_btsocket_l2cap_node == NULL) return (EINVAL); return (ng_btsocket_l2cap_peeraddr(so, nam)); } /* ng_btsocket_l2cap_accept */ /* * Create and attach new socket */ int ng_btsocket_l2cap_attach(struct socket *so, int proto, struct thread *td) { static u_int32_t token = 0; ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); int error; /* Check socket and protocol */ if (ng_btsocket_l2cap_node == NULL) return (EPROTONOSUPPORT); if (so->so_type != SOCK_SEQPACKET) return (ESOCKTNOSUPPORT); #if 0 /* XXX sonewconn() calls "pru_attach" with proto == 0 */ if (proto != 0) if (proto != BLUETOOTH_PROTO_L2CAP) return (EPROTONOSUPPORT); #endif /* XXX */ if (pcb != NULL) return (EISCONN); /* Reserve send and receive space if it is not reserved yet */ if ((so->so_snd.sb_hiwat == 0) || (so->so_rcv.sb_hiwat == 0)) { error = soreserve(so, NG_BTSOCKET_L2CAP_SENDSPACE, NG_BTSOCKET_L2CAP_RECVSPACE); if (error != 0) return (error); } /* Allocate the PCB */ pcb = malloc(sizeof(*pcb), M_NETGRAPH_BTSOCKET_L2CAP, M_NOWAIT | M_ZERO); if (pcb == NULL) return (ENOMEM); /* Link the PCB and the socket */ so->so_pcb = (caddr_t) pcb; pcb->so = so; pcb->state = NG_BTSOCKET_L2CAP_CLOSED; /* Initialize PCB */ pcb->imtu = pcb->omtu = NG_L2CAP_MTU_DEFAULT; /* Default flow */ pcb->iflow.flags = 0x0; pcb->iflow.service_type = NG_HCI_SERVICE_TYPE_BEST_EFFORT; pcb->iflow.token_rate = 0xffffffff; /* maximum */ pcb->iflow.token_bucket_size = 0xffffffff; /* maximum */ pcb->iflow.peak_bandwidth = 0x00000000; /* maximum */ pcb->iflow.latency = 0xffffffff; /* don't care */ pcb->iflow.delay_variation = 0xffffffff; /* don't care */ bcopy(&pcb->iflow, &pcb->oflow, sizeof(pcb->oflow)); pcb->flush_timo = NG_L2CAP_FLUSH_TIMO_DEFAULT; pcb->link_timo = NG_L2CAP_LINK_TIMO_DEFAULT; /* * XXX Mark PCB mutex as DUPOK to prevent "duplicated lock of * the same type" message. 
When accepting new L2CAP connection * ng_btsocket_l2cap_process_l2ca_con_ind() holds both PCB mutexes * for "old" (accepting) PCB and "new" (created) PCB. */ mtx_init(&pcb->pcb_mtx, "btsocks_l2cap_pcb_mtx", NULL, MTX_DEF|MTX_DUPOK); callout_init_mtx(&pcb->timo, &pcb->pcb_mtx, 0); /* * Add the PCB to the list * * XXX FIXME VERY IMPORTANT! * * This is totally FUBAR. We could get here in two cases: * * 1) When user calls socket() * 2) When we need to accept new incoming connection and call * sonewconn() * * In the first case we must acquire ng_btsocket_l2cap_sockets_mtx. * In the second case we hold ng_btsocket_l2cap_sockets_mtx already. * So we now need to distinguish between these cases. From reading * /sys/kern/uipc_socket.c we can find out that sonewconn() calls * pru_attach with proto == 0 and td == NULL. For now use this fact * to figure out if we were called from socket() or from sonewconn(). */ if (td != NULL) mtx_lock(&ng_btsocket_l2cap_sockets_mtx); else mtx_assert(&ng_btsocket_l2cap_sockets_mtx, MA_OWNED); /* Set PCB token. 
Use ng_btsocket_l2cap_sockets_mtx for protection */ if (++ token == 0) token ++; pcb->token = token; LIST_INSERT_HEAD(&ng_btsocket_l2cap_sockets, pcb, next); if (td != NULL) mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (0); } /* ng_btsocket_l2cap_attach */ /* * Bind socket */ int ng_btsocket_l2cap_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_l2cap_pcb_t *pcb = NULL; struct sockaddr_l2cap *sa = (struct sockaddr_l2cap *) nam; int psm, error = 0; if (ng_btsocket_l2cap_node == NULL) return (EINVAL); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->l2cap_family != AF_BLUETOOTH) return (EAFNOSUPPORT); /*For the time being, Not support LE binding.*/ if ((sa->l2cap_len != sizeof(*sa))&& (sa->l2cap_len != sizeof(struct sockaddr_l2cap_compat))) return (EINVAL); psm = le16toh(sa->l2cap_psm); /* * Check if other socket has this address already (look for exact * match PSM and bdaddr) and assign socket address if it's available. * * Note: socket can be bound to ANY PSM (zero) thus allowing several * channels with the same PSM between the same pair of BD_ADDR'es. 
*/ mtx_lock(&ng_btsocket_l2cap_sockets_mtx); LIST_FOREACH(pcb, &ng_btsocket_l2cap_sockets, next) if (psm != 0 && psm == pcb->psm && bcmp(&pcb->src, &sa->l2cap_bdaddr, sizeof(bdaddr_t)) == 0) break; if (pcb == NULL) { /* Set socket address */ pcb = so2l2cap_pcb(so); if (pcb != NULL) { bcopy(&sa->l2cap_bdaddr, &pcb->src, sizeof(pcb->src)); pcb->psm = psm; } else error = EINVAL; } else error = EADDRINUSE; mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); return (error); } /* ng_btsocket_l2cap_bind */ /* * Connect socket */ int ng_btsocket_l2cap_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_l2cap_pcb_t *pcb = so2l2cap_pcb(so); struct sockaddr_l2cap_compat *sal = (struct sockaddr_l2cap_compat *) nam; struct sockaddr_l2cap *sa = (struct sockaddr_l2cap *)nam; struct sockaddr_l2cap ba; ng_btsocket_l2cap_rtentry_t *rt = NULL; int have_src, error = 0; int idtype = NG_L2CAP_L2CA_IDTYPE_BREDR; /* Check socket */ if (pcb == NULL) return (EINVAL); if (ng_btsocket_l2cap_node == NULL) return (EINVAL); if (pcb->state == NG_BTSOCKET_L2CAP_CONNECTING) return (EINPROGRESS); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->l2cap_family != AF_BLUETOOTH) return (EAFNOSUPPORT); if (sa->l2cap_len == sizeof(*sal)){ bcopy(sal, &ba, sizeof(*sal)); sa = &ba; sa->l2cap_len = sizeof(*sa); sa->l2cap_bdaddr_type = BDADDR_BREDR; } if (sa->l2cap_len != sizeof(*sa)) return (EINVAL); if ((sa->l2cap_psm && sa->l2cap_cid)) return EINVAL; if (bcmp(&sa->l2cap_bdaddr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) return (EDESTADDRREQ); if((sa->l2cap_bdaddr_type == BDADDR_BREDR)&& (sa->l2cap_psm == 0)) return EDESTADDRREQ; if(sa->l2cap_bdaddr_type != BDADDR_BREDR){ if(sa->l2cap_cid == NG_L2CAP_ATT_CID){ idtype = NG_L2CAP_L2CA_IDTYPE_ATT; }else if (sa->l2cap_cid == NG_L2CAP_SMP_CID){ idtype =NG_L2CAP_L2CA_IDTYPE_SMP; }else{ //if cid == 0 idtype = NG_L2CAP_L2CA_IDTYPE_LE; // Not supported yet return EINVAL; } } if (pcb->psm != 0 && pcb->psm != le16toh(sa->l2cap_psm)) 
return (EINVAL); /* * Routing. Socket should be bound to some source address. The source * address can be ANY. Destination address must be set and it must not * be ANY. If source address is ANY then find first rtentry that has * src != dst. */ mtx_lock(&ng_btsocket_l2cap_rt_mtx); mtx_lock(&ng_btsocket_l2cap_sockets_mtx); mtx_lock(&pcb->pcb_mtx); /* Send destination address and PSM */ bcopy(&sa->l2cap_bdaddr, &pcb->dst, sizeof(pcb->dst)); pcb->psm = le16toh(sa->l2cap_psm); pcb->dsttype = sa->l2cap_bdaddr_type; pcb->cid = 0; pcb->idtype = idtype; pcb->rt = NULL; have_src = bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(pcb->src)); LIST_FOREACH(rt, &ng_btsocket_l2cap_rt, next) { if (rt->hook == NULL || NG_HOOK_NOT_VALID(rt->hook)) continue; /* Match src and dst */ if (have_src) { if (bcmp(&pcb->src, &rt->src, sizeof(rt->src)) == 0) break; } else { if (bcmp(&pcb->dst, &rt->src, sizeof(rt->src)) != 0) break; } } if (rt != NULL) { pcb->rt = rt; if (!have_src){ bcopy(&rt->src, &pcb->src, sizeof(pcb->src)); pcb->srctype = (sa->l2cap_bdaddr_type == BDADDR_BREDR)? 
BDADDR_BREDR : BDADDR_LE_PUBLIC; } } else error = EHOSTUNREACH; /* * Send L2CA_Connect request */ if (error == 0) { error = ng_btsocket_l2cap_send_l2ca_con_req(pcb); if (error == 0) { pcb->flags |= NG_BTSOCKET_L2CAP_CLIENT; pcb->state = NG_BTSOCKET_L2CAP_CONNECTING; soisconnecting(pcb->so); ng_btsocket_l2cap_timeout(pcb); } } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); mtx_unlock(&ng_btsocket_l2cap_rt_mtx); return (error); } /* ng_btsocket_l2cap_connect */ /* * Process ioctl's calls on socket */ int ng_btsocket_l2cap_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return (EINVAL); } /* ng_btsocket_l2cap_control */ /* * Process getsockopt/setsockopt system calls */ int ng_btsocket_l2cap_ctloutput(struct socket *so, struct sockopt *sopt) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); int error = 0; ng_l2cap_cfg_opt_val_t v; if (pcb == NULL) return (EINVAL); if (ng_btsocket_l2cap_node == NULL) return (EINVAL); if (sopt->sopt_level != SOL_L2CAP) return (0); mtx_lock(&pcb->pcb_mtx); switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case SO_L2CAP_IMTU: /* get incoming MTU */ error = sooptcopyout(sopt, &pcb->imtu, sizeof(pcb->imtu)); break; case SO_L2CAP_OMTU: /* get outgoing (peer incoming) MTU */ error = sooptcopyout(sopt, &pcb->omtu, sizeof(pcb->omtu)); break; case SO_L2CAP_IFLOW: /* get incoming flow spec. */ error = sooptcopyout(sopt, &pcb->iflow, sizeof(pcb->iflow)); break; case SO_L2CAP_OFLOW: /* get outgoing flow spec. 
*/ error = sooptcopyout(sopt, &pcb->oflow, sizeof(pcb->oflow)); break; case SO_L2CAP_FLUSH: /* get flush timeout */ error = sooptcopyout(sopt, &pcb->flush_timo, sizeof(pcb->flush_timo)); break; case SO_L2CAP_ENCRYPTED: /* get encrypt required */ error = sooptcopyout(sopt, &pcb->need_encrypt, sizeof(pcb->need_encrypt)); break; default: error = ENOPROTOOPT; break; } break; case SOPT_SET: /* * XXX * We do not allow to change these parameters while socket is * connected or we are in the process of creating a connection. * May be this should indicate re-configuration of the open * channel? */ if (pcb->state != NG_BTSOCKET_L2CAP_CLOSED) { error = EACCES; break; } switch (sopt->sopt_name) { case SO_L2CAP_IMTU: /* set incoming MTU */ error = sooptcopyin(sopt, &v, sizeof(v), sizeof(v.mtu)); if (error == 0) pcb->imtu = v.mtu; break; case SO_L2CAP_OFLOW: /* set outgoing flow spec. */ error = sooptcopyin(sopt, &v, sizeof(v),sizeof(v.flow)); if (error == 0) bcopy(&v.flow, &pcb->oflow, sizeof(pcb->oflow)); break; case SO_L2CAP_FLUSH: /* set flush timeout */ error = sooptcopyin(sopt, &v, sizeof(v), sizeof(v.flush_timo)); if (error == 0) pcb->flush_timo = v.flush_timo; break; case SO_L2CAP_ENCRYPTED: /*set connect encryption opt*/ if((pcb->state != NG_BTSOCKET_L2CAP_OPEN) && (pcb->state != NG_BTSOCKET_L2CAP_W4_ENC_CHANGE)){ error = sooptcopyin(sopt, &v, sizeof(v), sizeof(v.encryption)); if(error == 0) pcb->need_encrypt = (v.encryption)?1:0; }else{ error = EINVAL; } break; default: error = ENOPROTOOPT; break; } break; default: error = EINVAL; break; } mtx_unlock(&pcb->pcb_mtx); return (error); } /* ng_btsocket_l2cap_ctloutput */ /* * Detach and destroy socket */ void ng_btsocket_l2cap_detach(struct socket *so) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); KASSERT(pcb != NULL, ("ng_btsocket_l2cap_detach: pcb == NULL")); if (ng_btsocket_l2cap_node == NULL) return; mtx_lock(&ng_btsocket_l2cap_sockets_mtx); mtx_lock(&pcb->pcb_mtx); /* XXX what to do with pending request? 
*/ if (pcb->flags & NG_BTSOCKET_L2CAP_TIMO) ng_btsocket_l2cap_untimeout(pcb); if (pcb->state != NG_BTSOCKET_L2CAP_CLOSED && pcb->state != NG_BTSOCKET_L2CAP_DISCONNECTING) /* Send disconnect request with "zero" token */ ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); pcb->state = NG_BTSOCKET_L2CAP_CLOSED; LIST_REMOVE(pcb, next); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_l2cap_sockets_mtx); mtx_destroy(&pcb->pcb_mtx); bzero(pcb, sizeof(*pcb)); free(pcb, M_NETGRAPH_BTSOCKET_L2CAP); soisdisconnected(so); so->so_pcb = NULL; } /* ng_btsocket_l2cap_detach */ /* * Disconnect socket */ int ng_btsocket_l2cap_disconnect(struct socket *so) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); int error = 0; if (pcb == NULL) return (EINVAL); if (ng_btsocket_l2cap_node == NULL) return (EINVAL); mtx_lock(&pcb->pcb_mtx); if (pcb->state == NG_BTSOCKET_L2CAP_DISCONNECTING) { mtx_unlock(&pcb->pcb_mtx); return (EINPROGRESS); } if (pcb->state != NG_BTSOCKET_L2CAP_CLOSED) { /* XXX FIXME what to do with pending request? 
*/ if (pcb->flags & NG_BTSOCKET_L2CAP_TIMO) ng_btsocket_l2cap_untimeout(pcb); error = ng_btsocket_l2cap_send_l2ca_discon_req(pcb->token, pcb); if (error == 0) { pcb->state = NG_BTSOCKET_L2CAP_DISCONNECTING; soisdisconnecting(so); ng_btsocket_l2cap_timeout(pcb); } /* XXX FIXME what to do if error != 0 */ } mtx_unlock(&pcb->pcb_mtx); return (error); } /* ng_btsocket_l2cap_disconnect */ /* * Listen on socket */ int ng_btsocket_l2cap_listen(struct socket *so, int backlog, struct thread *td) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); int error; SOCK_LOCK(so); error = solisten_proto_check(so); if (error != 0) goto out; if (pcb == NULL) { error = EINVAL; goto out; } if (ng_btsocket_l2cap_node == NULL) { error = EINVAL; goto out; } if (pcb->psm == 0) { error = EADDRNOTAVAIL; goto out; } solisten_proto(so, backlog); out: SOCK_UNLOCK(so); return (error); } /* ng_btsocket_listen */ /* * Get peer address */ int ng_btsocket_l2cap_peeraddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); struct sockaddr_l2cap sa; if (pcb == NULL) return (EINVAL); if (ng_btsocket_l2cap_node == NULL) return (EINVAL); bcopy(&pcb->dst, &sa.l2cap_bdaddr, sizeof(sa.l2cap_bdaddr)); sa.l2cap_psm = htole16(pcb->psm); sa.l2cap_len = sizeof(sa); sa.l2cap_family = AF_BLUETOOTH; switch(pcb->idtype){ case NG_L2CAP_L2CA_IDTYPE_ATT: sa.l2cap_cid = NG_L2CAP_ATT_CID; break; case NG_L2CAP_L2CA_IDTYPE_SMP: sa.l2cap_cid = NG_L2CAP_SMP_CID; break; default: sa.l2cap_cid = 0; break; } sa.l2cap_bdaddr_type = pcb->dsttype; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? 
ENOMEM : 0); } /* ng_btsocket_l2cap_peeraddr */ /* * Send data to socket */ int ng_btsocket_l2cap_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { ng_btsocket_l2cap_pcb_t *pcb = so2l2cap_pcb(so); int error = 0; if (ng_btsocket_l2cap_node == NULL) { error = ENETDOWN; goto drop; } /* Check socket and input */ if (pcb == NULL || m == NULL || control != NULL) { error = EINVAL; goto drop; } mtx_lock(&pcb->pcb_mtx); /* Make sure socket is connected */ if (pcb->state != NG_BTSOCKET_L2CAP_OPEN) { mtx_unlock(&pcb->pcb_mtx); error = ENOTCONN; goto drop; } /* Check route */ if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) { mtx_unlock(&pcb->pcb_mtx); error = ENETDOWN; goto drop; } /* Check packet size against outgoing (peer's incoming) MTU) */ if (m->m_pkthdr.len > pcb->omtu) { NG_BTSOCKET_L2CAP_ERR( "%s: Packet too big, len=%d, omtu=%d\n", __func__, m->m_pkthdr.len, pcb->omtu); mtx_unlock(&pcb->pcb_mtx); error = EMSGSIZE; goto drop; } /* * First put packet on socket send queue. Then check if we have * pending timeout. If we do not have timeout then we must send * packet and schedule timeout. Otherwise do nothing and wait for * L2CA_WRITE_RSP. 
*/ sbappendrecord(&pcb->so->so_snd, m); m = NULL; if (!(pcb->flags & NG_BTSOCKET_L2CAP_TIMO)) { error = ng_btsocket_l2cap_send2(pcb); if (error == 0) ng_btsocket_l2cap_timeout(pcb); else sbdroprecord(&pcb->so->so_snd); /* XXX */ } mtx_unlock(&pcb->pcb_mtx); drop: NG_FREE_M(m); /* checks for != NULL */ NG_FREE_M(control); return (error); } /* ng_btsocket_l2cap_send */ /* * Send first packet in the socket queue to the L2CAP layer */ static int ng_btsocket_l2cap_send2(ng_btsocket_l2cap_pcb_p pcb) { struct mbuf *m = NULL; ng_l2cap_l2ca_hdr_t *hdr = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (sbavail(&pcb->so->so_snd) == 0) return (EINVAL); /* XXX */ m = m_dup(pcb->so->so_snd.sb_mb, M_NOWAIT); if (m == NULL) return (ENOBUFS); /* Create L2CA packet header */ M_PREPEND(m, sizeof(*hdr), M_NOWAIT); if (m != NULL) if (m->m_len < sizeof(*hdr)) m = m_pullup(m, sizeof(*hdr)); if (m == NULL) { NG_BTSOCKET_L2CAP_ERR( "%s: Failed to create L2CA packet header\n", __func__); return (ENOBUFS); } hdr = mtod(m, ng_l2cap_l2ca_hdr_t *); hdr->token = pcb->token; hdr->length = m->m_pkthdr.len - sizeof(*hdr); hdr->lcid = pcb->cid; hdr->idtype = pcb->idtype; NG_BTSOCKET_L2CAP_INFO( "%s: Sending packet: len=%d, length=%d, lcid=%d, token=%d, state=%d\n", __func__, m->m_pkthdr.len, hdr->length, hdr->lcid, hdr->token, pcb->state); /* * If we got here than we have successfully creates new L2CAP * data packet and now we can send it to the L2CAP layer */ NG_SEND_DATA_ONLY(error, pcb->rt->hook, m); return (error); } /* ng_btsocket_l2cap_send2 */ /* * Get socket address */ int ng_btsocket_l2cap_sockaddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_l2cap_pcb_p pcb = so2l2cap_pcb(so); struct sockaddr_l2cap sa; if (pcb == NULL) return (EINVAL); if (ng_btsocket_l2cap_node == NULL) return (EINVAL); bcopy(&pcb->src, &sa.l2cap_bdaddr, sizeof(sa.l2cap_bdaddr)); sa.l2cap_psm = htole16(pcb->psm); sa.l2cap_len = sizeof(sa); sa.l2cap_family = AF_BLUETOOTH; sa.l2cap_cid = 0; 
sa.l2cap_bdaddr_type = pcb->srctype; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } /* ng_btsocket_l2cap_sockaddr */ /***************************************************************************** ***************************************************************************** ** Misc. functions ***************************************************************************** *****************************************************************************/ /* * Look for the socket that listens on given PSM and bdaddr. Returns exact or * close match (if any). Caller must hold ng_btsocket_l2cap_sockets_mtx. */ static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_addr(bdaddr_p bdaddr, int psm) { ng_btsocket_l2cap_pcb_p p = NULL, p1 = NULL; mtx_assert(&ng_btsocket_l2cap_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_l2cap_sockets, next) { - if (p->so == NULL || !(p->so->so_options & SO_ACCEPTCONN) || - p->psm != psm) + if (p->so == NULL || !SOLISTENING(p->so) || p->psm != psm) continue; if (bcmp(&p->src, bdaddr, sizeof(p->src)) == 0) break; if (bcmp(&p->src, NG_HCI_BDADDR_ANY, sizeof(p->src)) == 0) p1 = p; } return ((p != NULL)? p : p1); } /* ng_btsocket_l2cap_pcb_by_addr */ /* * Look for the socket that has given token. * Caller must hold ng_btsocket_l2cap_sockets_mtx. */ static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_token(u_int32_t token) { ng_btsocket_l2cap_pcb_p p = NULL; if (token == 0) return (NULL); mtx_assert(&ng_btsocket_l2cap_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_l2cap_sockets, next) if (p->token == token) break; return (p); } /* ng_btsocket_l2cap_pcb_by_token */ /* * Look for the socket that assigned to given source address and channel ID. 
* Caller must hold ng_btsocket_l2cap_sockets_mtx */ static ng_btsocket_l2cap_pcb_p ng_btsocket_l2cap_pcb_by_cid(bdaddr_p src, int cid, int idtype) { ng_btsocket_l2cap_pcb_p p = NULL; mtx_assert(&ng_btsocket_l2cap_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_l2cap_sockets, next){ if (p->cid == cid && bcmp(src, &p->src, sizeof(p->src)) == 0&& p->idtype == idtype) break; } return (p); } /* ng_btsocket_l2cap_pcb_by_cid */ /* * Set timeout on socket */ static void ng_btsocket_l2cap_timeout(ng_btsocket_l2cap_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (!(pcb->flags & NG_BTSOCKET_L2CAP_TIMO)) { pcb->flags |= NG_BTSOCKET_L2CAP_TIMO; callout_reset(&pcb->timo, bluetooth_l2cap_ertx_timeout(), ng_btsocket_l2cap_process_timeout, pcb); } else KASSERT(0, ("%s: Duplicated socket timeout?!\n", __func__)); } /* ng_btsocket_l2cap_timeout */ /* * Unset timeout on socket */ static void ng_btsocket_l2cap_untimeout(ng_btsocket_l2cap_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->flags & NG_BTSOCKET_L2CAP_TIMO) { callout_stop(&pcb->timo); pcb->flags &= ~NG_BTSOCKET_L2CAP_TIMO; } else KASSERT(0, ("%s: No socket timeout?!\n", __func__)); } /* ng_btsocket_l2cap_untimeout */ /* * Process timeout on socket */ static void ng_btsocket_l2cap_process_timeout(void *xpcb) { ng_btsocket_l2cap_pcb_p pcb = (ng_btsocket_l2cap_pcb_p) xpcb; mtx_assert(&pcb->pcb_mtx, MA_OWNED); pcb->flags &= ~NG_BTSOCKET_L2CAP_TIMO; pcb->so->so_error = ETIMEDOUT; switch (pcb->state) { case NG_BTSOCKET_L2CAP_CONNECTING: case NG_BTSOCKET_L2CAP_CONFIGURING: case NG_BTSOCKET_L2CAP_W4_ENC_CHANGE: /* Send disconnect request with "zero" token */ if (pcb->cid != 0) ng_btsocket_l2cap_send_l2ca_discon_req(0, pcb); /* ... 
and close the socket */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); break; case NG_BTSOCKET_L2CAP_OPEN: /* Send timeout - drop packet and wakeup sender */ sbdroprecord(&pcb->so->so_snd); sowwakeup(pcb->so); break; case NG_BTSOCKET_L2CAP_DISCONNECTING: /* Disconnect timeout - disconnect the socket anyway */ pcb->state = NG_BTSOCKET_L2CAP_CLOSED; soisdisconnected(pcb->so); break; default: NG_BTSOCKET_L2CAP_ERR( "%s: Invalid socket state=%d\n", __func__, pcb->state); break; } } /* ng_btsocket_l2cap_process_timeout */ /* * Translate HCI/L2CAP error code into "errno" code * XXX Note: Some L2CAP and HCI error codes have the same value, but * different meaning */ static int ng_btsocket_l2cap_result2errno(int result) { switch (result) { case 0x00: /* No error */ return (0); case 0x01: /* Unknown HCI command */ return (ENODEV); case 0x02: /* No connection */ return (ENOTCONN); case 0x03: /* Hardware failure */ return (EIO); case 0x04: /* Page timeout */ return (EHOSTDOWN); case 0x05: /* Authentication failure */ case 0x06: /* Key missing */ case 0x18: /* Pairing not allowed */ case 0x21: /* Role change not allowed */ case 0x24: /* LMP PSU not allowed */ case 0x25: /* Encryption mode not acceptable */ case 0x26: /* Unit key used */ return (EACCES); case 0x07: /* Memory full */ return (ENOMEM); case 0x08: /* Connection timeout */ case 0x10: /* Host timeout */ case 0x22: /* LMP response timeout */ case 0xee: /* HCI timeout */ case 0xeeee: /* L2CAP timeout */ return (ETIMEDOUT); case 0x09: /* Max number of connections */ case 0x0a: /* Max number of SCO connections to a unit */ return (EMLINK); case 0x0b: /* ACL connection already exists */ return (EEXIST); case 0x0c: /* Command disallowed */ return (EBUSY); case 0x0d: /* Host rejected due to limited resources */ case 0x0e: /* Host rejected due to securiity reasons */ case 0x0f: /* Host rejected due to remote unit is a personal unit */ case 0x1b: /* SCO offset rejected */ case 0x1c: /* SCO interval rejected 
*/ case 0x1d: /* SCO air mode rejected */ return (ECONNREFUSED); case 0x11: /* Unsupported feature or parameter value */ case 0x19: /* Unknown LMP PDU */ case 0x1a: /* Unsupported remote feature */ case 0x20: /* Unsupported LMP parameter value */ case 0x27: /* QoS is not supported */ case 0x29: /* Paring with unit key not supported */ return (EOPNOTSUPP); case 0x12: /* Invalid HCI command parameter */ case 0x1e: /* Invalid LMP parameters */ return (EINVAL); case 0x13: /* Other end terminated connection: User ended connection */ case 0x14: /* Other end terminated connection: Low resources */ case 0x15: /* Other end terminated connection: About to power off */ return (ECONNRESET); case 0x16: /* Connection terminated by local host */ return (ECONNABORTED); #if 0 /* XXX not yet */ case 0x17: /* Repeated attempts */ case 0x1f: /* Unspecified error */ case 0x23: /* LMP error transaction collision */ case 0x28: /* Instant passed */ #endif } return (ENOSYS); } /* ng_btsocket_l2cap_result2errno */ diff --git a/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c b/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c index 95b28e6cc5e7..c0704bce55fa 100644 --- a/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c +++ b/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c @@ -1,3558 +1,3557 @@ /* * ng_btsocket_rfcomm.c */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001-2003 Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: ng_btsocket_rfcomm.c,v 1.28 2003/09/14 23:29:06 max Exp $ * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* MALLOC define */ #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_BTSOCKET_RFCOMM, "netgraph_btsocks_rfcomm", "Netgraph Bluetooth RFCOMM sockets"); #else #define M_NETGRAPH_BTSOCKET_RFCOMM M_NETGRAPH #endif /* NG_SEPARATE_MALLOC */ /* Debug */ #define NG_BTSOCKET_RFCOMM_INFO \ if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_INFO_LEVEL && \ ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \ printf #define NG_BTSOCKET_RFCOMM_WARN \ if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_WARN_LEVEL && \ ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \ printf #define NG_BTSOCKET_RFCOMM_ERR \ if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_ERR_LEVEL && \ ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \ printf #define NG_BTSOCKET_RFCOMM_ALERT \ if (ng_btsocket_rfcomm_debug_level >= 
NG_BTSOCKET_ALERT_LEVEL && \ ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \ printf #define ALOT 0x7fff /* Local prototypes */ static int ng_btsocket_rfcomm_upcall (struct socket *so, void *arg, int waitflag); static void ng_btsocket_rfcomm_sessions_task (void *ctx, int pending); static void ng_btsocket_rfcomm_session_task (ng_btsocket_rfcomm_session_p s); #define ng_btsocket_rfcomm_task_wakeup() \ taskqueue_enqueue(taskqueue_swi_giant, &ng_btsocket_rfcomm_task) static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_connect_ind (ng_btsocket_rfcomm_session_p s, int channel); static void ng_btsocket_rfcomm_connect_cfm (ng_btsocket_rfcomm_session_p s); static int ng_btsocket_rfcomm_session_create (ng_btsocket_rfcomm_session_p *sp, struct socket *l2so, bdaddr_p src, bdaddr_p dst, struct thread *td); static int ng_btsocket_rfcomm_session_accept (ng_btsocket_rfcomm_session_p s0); static int ng_btsocket_rfcomm_session_connect (ng_btsocket_rfcomm_session_p s); static int ng_btsocket_rfcomm_session_receive (ng_btsocket_rfcomm_session_p s); static int ng_btsocket_rfcomm_session_send (ng_btsocket_rfcomm_session_p s); static void ng_btsocket_rfcomm_session_clean (ng_btsocket_rfcomm_session_p s); static void ng_btsocket_rfcomm_session_process_pcb (ng_btsocket_rfcomm_session_p s); static ng_btsocket_rfcomm_session_p ng_btsocket_rfcomm_session_by_addr (bdaddr_p src, bdaddr_p dst); static int ng_btsocket_rfcomm_receive_frame (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_sabm (ng_btsocket_rfcomm_session_p s, int dlci); static int ng_btsocket_rfcomm_receive_disc (ng_btsocket_rfcomm_session_p s, int dlci); static int ng_btsocket_rfcomm_receive_ua (ng_btsocket_rfcomm_session_p s, int dlci); static int ng_btsocket_rfcomm_receive_dm (ng_btsocket_rfcomm_session_p s, int dlci); static int ng_btsocket_rfcomm_receive_uih (ng_btsocket_rfcomm_session_p s, int dlci, int pf, struct mbuf *m0); static int 
ng_btsocket_rfcomm_receive_mcc (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_test (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_fc (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_msc (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_rpn (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_rls (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static int ng_btsocket_rfcomm_receive_pn (ng_btsocket_rfcomm_session_p s, struct mbuf *m0); static void ng_btsocket_rfcomm_set_pn (ng_btsocket_rfcomm_pcb_p pcb, u_int8_t cr, u_int8_t flow_control, u_int8_t credits, u_int16_t mtu); static int ng_btsocket_rfcomm_send_command (ng_btsocket_rfcomm_session_p s, u_int8_t type, u_int8_t dlci); static int ng_btsocket_rfcomm_send_uih (ng_btsocket_rfcomm_session_p s, u_int8_t address, u_int8_t pf, u_int8_t credits, struct mbuf *data); static int ng_btsocket_rfcomm_send_msc (ng_btsocket_rfcomm_pcb_p pcb); static int ng_btsocket_rfcomm_send_pn (ng_btsocket_rfcomm_pcb_p pcb); static int ng_btsocket_rfcomm_send_credits (ng_btsocket_rfcomm_pcb_p pcb); static int ng_btsocket_rfcomm_pcb_send (ng_btsocket_rfcomm_pcb_p pcb, int limit); static void ng_btsocket_rfcomm_pcb_kill (ng_btsocket_rfcomm_pcb_p pcb, int error); static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_pcb_by_dlci (ng_btsocket_rfcomm_session_p s, int dlci); static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_pcb_listener (bdaddr_p src, int channel); static void ng_btsocket_rfcomm_timeout (ng_btsocket_rfcomm_pcb_p pcb); static void ng_btsocket_rfcomm_untimeout (ng_btsocket_rfcomm_pcb_p pcb); static void ng_btsocket_rfcomm_process_timeout (void *xpcb); static struct mbuf * ng_btsocket_rfcomm_prepare_packet (struct sockbuf *sb, int length); /* Globals */ extern int ifqmaxlen; static u_int32_t ng_btsocket_rfcomm_debug_level; static 
u_int32_t ng_btsocket_rfcomm_timo; struct task ng_btsocket_rfcomm_task; static LIST_HEAD(, ng_btsocket_rfcomm_session) ng_btsocket_rfcomm_sessions; static struct mtx ng_btsocket_rfcomm_sessions_mtx; static LIST_HEAD(, ng_btsocket_rfcomm_pcb) ng_btsocket_rfcomm_sockets; static struct mtx ng_btsocket_rfcomm_sockets_mtx; static struct timeval ng_btsocket_rfcomm_lasttime; static int ng_btsocket_rfcomm_curpps; /* Sysctl tree */ SYSCTL_DECL(_net_bluetooth_rfcomm_sockets); static SYSCTL_NODE(_net_bluetooth_rfcomm_sockets, OID_AUTO, stream, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Bluetooth STREAM RFCOMM sockets family"); SYSCTL_UINT(_net_bluetooth_rfcomm_sockets_stream, OID_AUTO, debug_level, CTLFLAG_RW, &ng_btsocket_rfcomm_debug_level, NG_BTSOCKET_INFO_LEVEL, "Bluetooth STREAM RFCOMM sockets debug level"); SYSCTL_UINT(_net_bluetooth_rfcomm_sockets_stream, OID_AUTO, timeout, CTLFLAG_RW, &ng_btsocket_rfcomm_timo, 60, "Bluetooth STREAM RFCOMM sockets timeout"); /***************************************************************************** ***************************************************************************** ** RFCOMM CRC ***************************************************************************** *****************************************************************************/ static u_int8_t ng_btsocket_rfcomm_crc_table[256] = { 0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75, 0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b, 0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69, 0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67, 0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d, 0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43, 0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51, 0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f, 0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05, 0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b, 0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19, 0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17, 0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d, 0x46, 0xd7, 0xa5, 0x34, 0x41, 
0xd0, 0xa2, 0x33, 0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21, 0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f, 0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95, 0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b, 0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89, 0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87, 0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad, 0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3, 0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1, 0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf, 0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5, 0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb, 0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9, 0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7, 0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd, 0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3, 0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1, 0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf }; /* CRC */ static u_int8_t ng_btsocket_rfcomm_crc(u_int8_t *data, int length) { u_int8_t crc = 0xff; while (length --) crc = ng_btsocket_rfcomm_crc_table[crc ^ *data++]; return (crc); } /* ng_btsocket_rfcomm_crc */ /* FCS on 2 bytes */ static u_int8_t ng_btsocket_rfcomm_fcs2(u_int8_t *data) { return (0xff - ng_btsocket_rfcomm_crc(data, 2)); } /* ng_btsocket_rfcomm_fcs2 */ /* FCS on 3 bytes */ static u_int8_t ng_btsocket_rfcomm_fcs3(u_int8_t *data) { return (0xff - ng_btsocket_rfcomm_crc(data, 3)); } /* ng_btsocket_rfcomm_fcs3 */ /* * Check FCS * * From Bluetooth spec * * "... In 07.10, the frame check sequence (FCS) is calculated on different * sets of fields for different frame types. These are the fields that the * FCS are calculated on: * * For SABM, DISC, UA, DM frames: on Address, Control and length field. * For UIH frames: on Address and Control field. 
* * (This is stated here for clarification, and to set the standard for RFCOMM; * the fields included in FCS calculation have actually changed in version * 7.0.0 of TS 07.10, but RFCOMM will not change the FCS calculation scheme * from the one above.) ..." */ static int ng_btsocket_rfcomm_check_fcs(u_int8_t *data, int type, u_int8_t fcs) { if (type != RFCOMM_FRAME_UIH) return (ng_btsocket_rfcomm_fcs3(data) != fcs); return (ng_btsocket_rfcomm_fcs2(data) != fcs); } /* ng_btsocket_rfcomm_check_fcs */ /***************************************************************************** ***************************************************************************** ** Socket interface ***************************************************************************** *****************************************************************************/ /* * Initialize everything */ void ng_btsocket_rfcomm_init(void) { /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; ng_btsocket_rfcomm_debug_level = NG_BTSOCKET_WARN_LEVEL; ng_btsocket_rfcomm_timo = 60; /* RFCOMM task */ TASK_INIT(&ng_btsocket_rfcomm_task, 0, ng_btsocket_rfcomm_sessions_task, NULL); /* RFCOMM sessions list */ LIST_INIT(&ng_btsocket_rfcomm_sessions); mtx_init(&ng_btsocket_rfcomm_sessions_mtx, "btsocks_rfcomm_sessions_mtx", NULL, MTX_DEF); /* RFCOMM sockets list */ LIST_INIT(&ng_btsocket_rfcomm_sockets); mtx_init(&ng_btsocket_rfcomm_sockets_mtx, "btsocks_rfcomm_sockets_mtx", NULL, MTX_DEF); } /* ng_btsocket_rfcomm_init */ /* * Abort connection on socket */ void ng_btsocket_rfcomm_abort(struct socket *so) { so->so_error = ECONNABORTED; (void)ng_btsocket_rfcomm_disconnect(so); } /* ng_btsocket_rfcomm_abort */ void ng_btsocket_rfcomm_close(struct socket *so) { (void)ng_btsocket_rfcomm_disconnect(so); } /* ng_btsocket_rfcomm_close */ /* * Accept connection on socket. Nothing to do here, socket must be connected * and ready, so just return peer address and be done with it. 
*/ int ng_btsocket_rfcomm_accept(struct socket *so, struct sockaddr **nam) { return (ng_btsocket_rfcomm_peeraddr(so, nam)); } /* ng_btsocket_rfcomm_accept */ /* * Create and attach new socket */ int ng_btsocket_rfcomm_attach(struct socket *so, int proto, struct thread *td) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); int error; /* Check socket and protocol */ if (so->so_type != SOCK_STREAM) return (ESOCKTNOSUPPORT); #if 0 /* XXX sonewconn() calls "pru_attach" with proto == 0 */ if (proto != 0) if (proto != BLUETOOTH_PROTO_RFCOMM) return (EPROTONOSUPPORT); #endif /* XXX */ if (pcb != NULL) return (EISCONN); /* Reserve send and receive space if it is not reserved yet */ if ((so->so_snd.sb_hiwat == 0) || (so->so_rcv.sb_hiwat == 0)) { error = soreserve(so, NG_BTSOCKET_RFCOMM_SENDSPACE, NG_BTSOCKET_RFCOMM_RECVSPACE); if (error != 0) return (error); } /* Allocate the PCB */ pcb = malloc(sizeof(*pcb), M_NETGRAPH_BTSOCKET_RFCOMM, M_NOWAIT | M_ZERO); if (pcb == NULL) return (ENOMEM); /* Link the PCB and the socket */ so->so_pcb = (caddr_t) pcb; pcb->so = so; /* Initialize PCB */ pcb->state = NG_BTSOCKET_RFCOMM_DLC_CLOSED; pcb->flags = NG_BTSOCKET_RFCOMM_DLC_CFC; pcb->lmodem = pcb->rmodem = (RFCOMM_MODEM_RTC | RFCOMM_MODEM_RTR | RFCOMM_MODEM_DV); pcb->mtu = RFCOMM_DEFAULT_MTU; pcb->tx_cred = 0; pcb->rx_cred = RFCOMM_DEFAULT_CREDITS; mtx_init(&pcb->pcb_mtx, "btsocks_rfcomm_pcb_mtx", NULL, MTX_DEF); callout_init_mtx(&pcb->timo, &pcb->pcb_mtx, 0); /* Add the PCB to the list */ mtx_lock(&ng_btsocket_rfcomm_sockets_mtx); LIST_INSERT_HEAD(&ng_btsocket_rfcomm_sockets, pcb, next); mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); return (0); } /* ng_btsocket_rfcomm_attach */ /* * Bind socket */ int ng_btsocket_rfcomm_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_rfcomm_pcb_t *pcb = so2rfcomm_pcb(so), *pcb1; struct sockaddr_rfcomm *sa = (struct sockaddr_rfcomm *) nam; if (pcb == NULL) return (EINVAL); /* Verify address */ if (sa == NULL) return 
(EINVAL); if (sa->rfcomm_family != AF_BLUETOOTH) return (EAFNOSUPPORT); if (sa->rfcomm_len != sizeof(*sa)) return (EINVAL); if (sa->rfcomm_channel > 30) return (EINVAL); mtx_lock(&pcb->pcb_mtx); if (sa->rfcomm_channel != 0) { mtx_lock(&ng_btsocket_rfcomm_sockets_mtx); LIST_FOREACH(pcb1, &ng_btsocket_rfcomm_sockets, next) { if (pcb1->channel == sa->rfcomm_channel && bcmp(&pcb1->src, &sa->rfcomm_bdaddr, sizeof(pcb1->src)) == 0) { mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); mtx_unlock(&pcb->pcb_mtx); return (EADDRINUSE); } } mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); } bcopy(&sa->rfcomm_bdaddr, &pcb->src, sizeof(pcb->src)); pcb->channel = sa->rfcomm_channel; mtx_unlock(&pcb->pcb_mtx); return (0); } /* ng_btsocket_rfcomm_bind */ /* * Connect socket */ int ng_btsocket_rfcomm_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_rfcomm_pcb_t *pcb = so2rfcomm_pcb(so); struct sockaddr_rfcomm *sa = (struct sockaddr_rfcomm *) nam; ng_btsocket_rfcomm_session_t *s = NULL; struct socket *l2so = NULL; int dlci, error = 0; if (pcb == NULL) return (EINVAL); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->rfcomm_family != AF_BLUETOOTH) return (EAFNOSUPPORT); if (sa->rfcomm_len != sizeof(*sa)) return (EINVAL); if (sa->rfcomm_channel > 30) return (EINVAL); if (sa->rfcomm_channel == 0 || bcmp(&sa->rfcomm_bdaddr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) return (EDESTADDRREQ); /* * Note that we will not check for errors in socreate() because * if we failed to create L2CAP socket at this point we still * might have already open session. */ error = socreate(PF_BLUETOOTH, &l2so, SOCK_SEQPACKET, BLUETOOTH_PROTO_L2CAP, td->td_ucred, td); /* * Look for session between "pcb->src" and "sa->rfcomm_bdaddr" (dst) */ mtx_lock(&ng_btsocket_rfcomm_sessions_mtx); s = ng_btsocket_rfcomm_session_by_addr(&pcb->src, &sa->rfcomm_bdaddr); if (s == NULL) { /* * We need to create new RFCOMM session. Check if we have L2CAP * socket. 
If l2so == NULL then error has the error code from * socreate() */ if (l2so == NULL) { mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); return (error); } error = ng_btsocket_rfcomm_session_create(&s, l2so, &pcb->src, &sa->rfcomm_bdaddr, td); if (error != 0) { mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); soclose(l2so); return (error); } } else if (l2so != NULL) soclose(l2so); /* we don't need new L2CAP socket */ /* * Check if we already have the same DLCI the same session */ mtx_lock(&s->session_mtx); mtx_lock(&pcb->pcb_mtx); dlci = RFCOMM_MKDLCI(!INITIATOR(s), sa->rfcomm_channel); if (ng_btsocket_rfcomm_pcb_by_dlci(s, dlci) != NULL) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&s->session_mtx); mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); return (EBUSY); } /* * Check session state and if its not acceptable then refuse connection */ switch (s->state) { case NG_BTSOCKET_RFCOMM_SESSION_CONNECTING: case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED: case NG_BTSOCKET_RFCOMM_SESSION_OPEN: /* * Update destination address and channel and attach * DLC to the session */ bcopy(&sa->rfcomm_bdaddr, &pcb->dst, sizeof(pcb->dst)); pcb->channel = sa->rfcomm_channel; pcb->dlci = dlci; LIST_INSERT_HEAD(&s->dlcs, pcb, session_next); pcb->session = s; ng_btsocket_rfcomm_timeout(pcb); soisconnecting(pcb->so); if (s->state == NG_BTSOCKET_RFCOMM_SESSION_OPEN) { pcb->mtu = s->mtu; bcopy(&so2l2cap_pcb(s->l2so)->src, &pcb->src, sizeof(pcb->src)); pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONFIGURING; error = ng_btsocket_rfcomm_send_pn(pcb); if (error == 0) error = ng_btsocket_rfcomm_task_wakeup(); } else pcb->state = NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT; break; default: error = ECONNRESET; break; } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&s->session_mtx); mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); return (error); } /* ng_btsocket_rfcomm_connect */ /* * Process ioctl's calls on socket. 
* XXX FIXME this should provide interface to the RFCOMM multiplexor channel */ int ng_btsocket_rfcomm_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return (EINVAL); } /* ng_btsocket_rfcomm_control */ /* * Process getsockopt/setsockopt system calls */ int ng_btsocket_rfcomm_ctloutput(struct socket *so, struct sockopt *sopt) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); struct ng_btsocket_rfcomm_fc_info fcinfo; int error = 0; if (pcb == NULL) return (EINVAL); if (sopt->sopt_level != SOL_RFCOMM) return (0); mtx_lock(&pcb->pcb_mtx); switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case SO_RFCOMM_MTU: error = sooptcopyout(sopt, &pcb->mtu, sizeof(pcb->mtu)); break; case SO_RFCOMM_FC_INFO: fcinfo.lmodem = pcb->lmodem; fcinfo.rmodem = pcb->rmodem; fcinfo.tx_cred = pcb->tx_cred; fcinfo.rx_cred = pcb->rx_cred; fcinfo.cfc = (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC)? 1 : 0; fcinfo.reserved = 0; error = sooptcopyout(sopt, &fcinfo, sizeof(fcinfo)); break; default: error = ENOPROTOOPT; break; } break; case SOPT_SET: switch (sopt->sopt_name) { default: error = ENOPROTOOPT; break; } break; default: error = EINVAL; break; } mtx_unlock(&pcb->pcb_mtx); return (error); } /* ng_btsocket_rfcomm_ctloutput */ /* * Detach and destroy socket */ void ng_btsocket_rfcomm_detach(struct socket *so) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); KASSERT(pcb != NULL, ("ng_btsocket_rfcomm_detach: pcb == NULL")); mtx_lock(&pcb->pcb_mtx); switch (pcb->state) { case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT: case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING: case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: case NG_BTSOCKET_RFCOMM_DLC_CONNECTED: /* XXX What to do with pending request? 
*/ if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) ng_btsocket_rfcomm_untimeout(pcb); if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT) pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_DETACHED; else pcb->state = NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING; ng_btsocket_rfcomm_task_wakeup(); break; case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING: ng_btsocket_rfcomm_task_wakeup(); break; } while (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CLOSED) msleep(&pcb->state, &pcb->pcb_mtx, PZERO, "rf_det", 0); if (pcb->session != NULL) panic("%s: pcb->session != NULL\n", __func__); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) panic("%s: timeout on closed DLC, flags=%#x\n", __func__, pcb->flags); mtx_lock(&ng_btsocket_rfcomm_sockets_mtx); LIST_REMOVE(pcb, next); mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); mtx_unlock(&pcb->pcb_mtx); mtx_destroy(&pcb->pcb_mtx); bzero(pcb, sizeof(*pcb)); free(pcb, M_NETGRAPH_BTSOCKET_RFCOMM); soisdisconnected(so); so->so_pcb = NULL; } /* ng_btsocket_rfcomm_detach */ /* * Disconnect socket */ int ng_btsocket_rfcomm_disconnect(struct socket *so) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); if (pcb == NULL) return (EINVAL); mtx_lock(&pcb->pcb_mtx); if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING) { mtx_unlock(&pcb->pcb_mtx); return (EINPROGRESS); } /* XXX What to do with pending request? */ if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) ng_btsocket_rfcomm_untimeout(pcb); switch (pcb->state) { case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING: /* XXX can we get here? */ case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: /* XXX can we get here? */ case NG_BTSOCKET_RFCOMM_DLC_CONNECTED: /* * Just change DLC state and enqueue RFCOMM task. It will * queue and send DISC on the DLC. 
*/ pcb->state = NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING; soisdisconnecting(so); ng_btsocket_rfcomm_task_wakeup(); break; case NG_BTSOCKET_RFCOMM_DLC_CLOSED: case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT: break; default: panic("%s: Invalid DLC state=%d, flags=%#x\n", __func__, pcb->state, pcb->flags); break; } mtx_unlock(&pcb->pcb_mtx); return (0); } /* ng_btsocket_rfcomm_disconnect */ /* * Listen on socket. First call to listen() will create listening RFCOMM session */ int ng_btsocket_rfcomm_listen(struct socket *so, int backlog, struct thread *td) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so), pcb1; ng_btsocket_rfcomm_session_p s = NULL; struct socket *l2so = NULL; int error, socreate_error, usedchannels; if (pcb == NULL) return (EINVAL); if (pcb->channel > 30) return (EADDRNOTAVAIL); usedchannels = 0; mtx_lock(&pcb->pcb_mtx); if (pcb->channel == 0) { mtx_lock(&ng_btsocket_rfcomm_sockets_mtx); LIST_FOREACH(pcb1, &ng_btsocket_rfcomm_sockets, next) if (pcb1->channel != 0 && bcmp(&pcb1->src, &pcb->src, sizeof(pcb->src)) == 0) usedchannels |= (1 << (pcb1->channel - 1)); for (pcb->channel = 30; pcb->channel > 0; pcb->channel --) if (!(usedchannels & (1 << (pcb->channel - 1)))) break; if (pcb->channel == 0) { mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); mtx_unlock(&pcb->pcb_mtx); return (EADDRNOTAVAIL); } mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); } mtx_unlock(&pcb->pcb_mtx); /* * Note that we will not check for errors in socreate() because * if we failed to create L2CAP socket at this point we still * might have already open session. */ socreate_error = socreate(PF_BLUETOOTH, &l2so, SOCK_SEQPACKET, BLUETOOTH_PROTO_L2CAP, td->td_ucred, td); /* * Transition the socket and session into the LISTENING state. Check * for collisions first, as there can only be one. 
*/ mtx_lock(&ng_btsocket_rfcomm_sessions_mtx); SOCK_LOCK(so); error = solisten_proto_check(so); SOCK_UNLOCK(so); if (error != 0) goto out; LIST_FOREACH(s, &ng_btsocket_rfcomm_sessions, next) if (s->state == NG_BTSOCKET_RFCOMM_SESSION_LISTENING) break; if (s == NULL) { /* * We need to create default RFCOMM session. Check if we have * L2CAP socket. If l2so == NULL then error has the error code * from socreate() */ if (l2so == NULL) { error = socreate_error; goto out; } /* * Create default listen RFCOMM session. The default RFCOMM * session will listen on ANY address. * * XXX FIXME Note that currently there is no way to adjust MTU * for the default session. */ error = ng_btsocket_rfcomm_session_create(&s, l2so, NG_HCI_BDADDR_ANY, NULL, td); if (error != 0) goto out; l2so = NULL; } SOCK_LOCK(so); solisten_proto(so, backlog); SOCK_UNLOCK(so); out: mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); /* * If we still have an l2so reference here, it's unneeded, so release * it. */ if (l2so != NULL) soclose(l2so); return (error); } /* ng_btsocket_listen */ /* * Get peer address */ int ng_btsocket_rfcomm_peeraddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); struct sockaddr_rfcomm sa; if (pcb == NULL) return (EINVAL); bcopy(&pcb->dst, &sa.rfcomm_bdaddr, sizeof(sa.rfcomm_bdaddr)); sa.rfcomm_channel = pcb->channel; sa.rfcomm_len = sizeof(sa); sa.rfcomm_family = AF_BLUETOOTH; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? 
ENOMEM : 0); } /* ng_btsocket_rfcomm_peeraddr */ /* * Send data to socket */ int ng_btsocket_rfcomm_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { ng_btsocket_rfcomm_pcb_t *pcb = so2rfcomm_pcb(so); int error = 0; /* Check socket and input */ if (pcb == NULL || m == NULL || control != NULL) { error = EINVAL; goto drop; } mtx_lock(&pcb->pcb_mtx); /* Make sure DLC is connected */ if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTED) { mtx_unlock(&pcb->pcb_mtx); error = ENOTCONN; goto drop; } /* Put the packet on the socket's send queue and wakeup RFCOMM task */ sbappend(&pcb->so->so_snd, m, flags); m = NULL; if (!(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_SENDING)) { pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_SENDING; error = ng_btsocket_rfcomm_task_wakeup(); } mtx_unlock(&pcb->pcb_mtx); drop: NG_FREE_M(m); /* checks for != NULL */ NG_FREE_M(control); return (error); } /* ng_btsocket_rfcomm_send */ /* * Get socket address */ int ng_btsocket_rfcomm_sockaddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_rfcomm_pcb_p pcb = so2rfcomm_pcb(so); struct sockaddr_rfcomm sa; if (pcb == NULL) return (EINVAL); bcopy(&pcb->src, &sa.rfcomm_bdaddr, sizeof(sa.rfcomm_bdaddr)); sa.rfcomm_channel = pcb->channel; sa.rfcomm_len = sizeof(sa); sa.rfcomm_family = AF_BLUETOOTH; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } /* ng_btsocket_rfcomm_sockaddr */ /* * Upcall function for L2CAP sockets. Enqueue RFCOMM task. */ static int ng_btsocket_rfcomm_upcall(struct socket *so, void *arg, int waitflag) { int error; if (so == NULL) panic("%s: so == NULL\n", __func__); if ((error = ng_btsocket_rfcomm_task_wakeup()) != 0) NG_BTSOCKET_RFCOMM_ALERT( "%s: Could not enqueue RFCOMM task, error=%d\n", __func__, error); return (SU_OK); } /* ng_btsocket_rfcomm_upcall */ /* * RFCOMM task. Will handle all RFCOMM sessions in one pass. 
* XXX FIXME does not scale very well */ static void ng_btsocket_rfcomm_sessions_task(void *ctx, int pending) { ng_btsocket_rfcomm_session_p s = NULL, s_next = NULL; mtx_lock(&ng_btsocket_rfcomm_sessions_mtx); for (s = LIST_FIRST(&ng_btsocket_rfcomm_sessions); s != NULL; ) { mtx_lock(&s->session_mtx); s_next = LIST_NEXT(s, next); ng_btsocket_rfcomm_session_task(s); if (s->state == NG_BTSOCKET_RFCOMM_SESSION_CLOSED) { /* Unlink and clean the session */ LIST_REMOVE(s, next); NG_BT_MBUFQ_DRAIN(&s->outq); if (!LIST_EMPTY(&s->dlcs)) panic("%s: DLC list is not empty\n", __func__); /* Close L2CAP socket */ SOCKBUF_LOCK(&s->l2so->so_rcv); soupcall_clear(s->l2so, SO_RCV); SOCKBUF_UNLOCK(&s->l2so->so_rcv); SOCKBUF_LOCK(&s->l2so->so_snd); soupcall_clear(s->l2so, SO_SND); SOCKBUF_UNLOCK(&s->l2so->so_snd); soclose(s->l2so); mtx_unlock(&s->session_mtx); mtx_destroy(&s->session_mtx); bzero(s, sizeof(*s)); free(s, M_NETGRAPH_BTSOCKET_RFCOMM); } else mtx_unlock(&s->session_mtx); s = s_next; } mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx); } /* ng_btsocket_rfcomm_sessions_task */ /* * Process RFCOMM session. Will handle all RFCOMM sockets in one pass. 
*/

/*
 * Caller must hold s->session_mtx.
 *
 * If the L2CAP socket can no longer receive, the session is moved to CLOSED
 * and all of its DLCs are cleaned. After that the work done depends on the
 * session state: a listening session accepts until accept fails, a
 * connecting session checks the connect result, and a connected, open or
 * disconnecting session receives and then sends. Any failure in those steps
 * moves the session to CLOSED and cleans its DLCs. An unknown state panics.
 */

static void
ng_btsocket_rfcomm_session_task(ng_btsocket_rfcomm_session_p s)
{
	int	failed = 0;

	mtx_assert(&s->session_mtx, MA_OWNED);

	if (s->l2so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		NG_BTSOCKET_RFCOMM_INFO(
"%s: L2CAP connection has been terminated, so=%p, so_state=%#x, so_count=%d, " \
"state=%d, flags=%#x\n",
			__func__, s->l2so, s->l2so->so_state,
			s->l2so->so_count, s->state, s->flags);

		s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
		ng_btsocket_rfcomm_session_clean(s);
	}

	/* Now process upcall */
	switch (s->state) {
	case NG_BTSOCKET_RFCOMM_SESSION_LISTENING:
		/* Accept every pending L2CAP connection */
		while (ng_btsocket_rfcomm_session_accept(s) == 0)
			continue;
		break;

	case NG_BTSOCKET_RFCOMM_SESSION_CONNECTING:
		/* Look at the outcome of the L2CAP connect */
		ng_btsocket_rfcomm_session_process_pcb(s);
		failed = (ng_btsocket_rfcomm_session_connect(s) != 0);
		break;

	case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED:
	case NG_BTSOCKET_RFCOMM_SESSION_OPEN:
	case NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING:
		/* Receive first; send only when receive succeeded */
		ng_btsocket_rfcomm_session_process_pcb(s);
		failed = (ng_btsocket_rfcomm_session_receive(s) != 0 ||
		    ng_btsocket_rfcomm_session_send(s) != 0);
		break;

	case NG_BTSOCKET_RFCOMM_SESSION_CLOSED:
		break;

	default:
		panic("%s: Invalid session state=%d, flags=%#x\n",
			__func__, s->state, s->flags);
		break;
	}

	if (failed) {
		s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
		ng_btsocket_rfcomm_session_clean(s);
	}
} /* ng_btsocket_rfcomm_session_task */

/*
 * Process RFCOMM connection indicator.
Caller must hold s->session_mtx */

/*
 * Looks up a listening RFCOMM socket for the session's local address and
 * "channel", asks sonewconn() for a new socket on it and links the new
 * socket's pcb to session "s" as a DLC.
 *
 * Returns the new pcb, or NULL if nobody listens on the channel or
 * sonewconn() did not return a socket. Panics if the new socket has no pcb.
 */

static ng_btsocket_rfcomm_pcb_p
ng_btsocket_rfcomm_connect_ind(ng_btsocket_rfcomm_session_p s, int channel)
{
	ng_btsocket_rfcomm_pcb_p	 pcb = NULL, pcb1 = NULL;
	ng_btsocket_l2cap_pcb_p		 l2pcb = NULL;
	struct socket			*so1;

	mtx_assert(&s->session_mtx, MA_OWNED);

	/*
	 * Try to find RFCOMM socket that listens on given source address 
	 * and channel. This will return the best possible match.
	 */

	l2pcb = so2l2cap_pcb(s->l2so);
	pcb = ng_btsocket_rfcomm_pcb_listener(&l2pcb->src, channel);
	if (pcb == NULL)
		return (NULL);

	/*
	 * Check the pending connections queue and if we have space then 
	 * create new socket and set proper source and destination address,
	 * and channel.
	 */

	mtx_lock(&pcb->pcb_mtx);

	CURVNET_SET(pcb->so->so_vnet);
	so1 = sonewconn(pcb->so, 0);
	CURVNET_RESTORE();

	mtx_unlock(&pcb->pcb_mtx);

	if (so1 == NULL)
		return (NULL);

	/*
	 * If we got here then we have created new socket. So complete the 
	 * connection. Set source and destination address from the session.
	 */

	pcb1 = so2rfcomm_pcb(so1);
	if (pcb1 == NULL)
		panic("%s: pcb1 == NULL\n", __func__);

	mtx_lock(&pcb1->pcb_mtx);

	bcopy(&l2pcb->src, &pcb1->src, sizeof(pcb1->src));
	bcopy(&l2pcb->dst, &pcb1->dst, sizeof(pcb1->dst));
	pcb1->channel = channel;

	/* Link new DLC to the session. We already hold s->session_mtx */
	LIST_INSERT_HEAD(&s->dlcs, pcb1, session_next);
	pcb1->session = s;

	mtx_unlock(&pcb1->pcb_mtx);

	return (pcb1);
} /* ng_btsocket_rfcomm_connect_ind */

/*
 * Process RFCOMM connect confirmation. Caller must hold s->session_mtx.
 *
 * For every DLC on the session that is in W4_CONNECT state: copy the session
 * MTU and the L2CAP source address into the DLC and send a PN request. On
 * success the DLC moves to CONFIGURING, otherwise it is killed with the
 * error returned by ng_btsocket_rfcomm_send_pn().
 */

static void
ng_btsocket_rfcomm_connect_cfm(ng_btsocket_rfcomm_session_p s)
{
	ng_btsocket_rfcomm_pcb_p	pcb = NULL, pcb_next = NULL;
	int				error;

	mtx_assert(&s->session_mtx, MA_OWNED);

	/*
	 * Wake up all waiting sockets and send PN request for each of them. 
	 * Note that timeout already been set in ng_btsocket_rfcomm_connect()
	 *
	 * Note: cannot use LIST_FOREACH because ng_btsocket_rfcomm_pcb_kill
	 * will unlink DLC from the session
	 */

	for (pcb = LIST_FIRST(&s->dlcs); pcb != NULL; ) {
		mtx_lock(&pcb->pcb_mtx);
		/* Fetch the successor before this DLC can be unlinked */
		pcb_next = LIST_NEXT(pcb, session_next);

		if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT) {
			pcb->mtu = s->mtu;
			bcopy(&so2l2cap_pcb(s->l2so)->src, &pcb->src,
				sizeof(pcb->src));

			error = ng_btsocket_rfcomm_send_pn(pcb);
			if (error == 0)
				pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONFIGURING;
			else
				ng_btsocket_rfcomm_pcb_kill(pcb, error);
		}

		mtx_unlock(&pcb->pcb_mtx);
		pcb = pcb_next;
	}
} /* ng_btsocket_rfcomm_connect_cfm */

/*****************************************************************************
 *****************************************************************************
 **                               RFCOMM sessions
 *****************************************************************************
 *****************************************************************************/

/*
 * Create new RFCOMM session. That function WILL NOT take ownership over l2so.
 * Caller MUST free l2so if function failed.
 *
 * Caller must hold ng_btsocket_rfcomm_sessions_mtx.
 *
 * The session is allocated with M_NOWAIT, the RFCOMM upcall is installed on
 * both socket buffers of "l2so" and the socket is made non-blocking. What
 * happens next depends on "src" and "dst":
 *  - both NULL: the session is only created and linked; the caller does
 *    the rest;
 *  - "dst" NULL: the socket is bound to "src" on the RFCOMM PSM and put
 *    into listen state (session state LISTENING);
 *  - otherwise: the socket is bound to "src" and a connect to "dst" on the
 *    RFCOMM PSM is started (session state CONNECTING, INITIATOR flag set).
 *
 * On success the session is inserted into the global session list, stored
 * in "*sp" and 0 is returned. On failure the upcalls and SS_NBIO are removed
 * from "l2so" again, the session is freed and the error is returned; "*sp"
 * is not written.
 */

static int
ng_btsocket_rfcomm_session_create(ng_btsocket_rfcomm_session_p *sp,
		struct socket *l2so, bdaddr_p src, bdaddr_p dst,
		struct thread *td)
{
	ng_btsocket_rfcomm_session_p	s = NULL;
	struct sockaddr_l2cap		l2sa;
	struct sockopt			l2sopt;
	int				error;
	u_int16_t			mtu;

	mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED);

	/* Allocate the RFCOMM session */
        s = malloc(sizeof(*s),
		M_NETGRAPH_BTSOCKET_RFCOMM, M_NOWAIT | M_ZERO);
        if (s == NULL)
                return (ENOMEM);

	/* Set defaults */
	s->mtu = RFCOMM_DEFAULT_MTU;
	s->flags = 0;
	s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
	NG_BT_MBUFQ_INIT(&s->outq, ifqmaxlen);

	/*
	 * XXX Mark session mutex as DUPOK to prevent "duplicated lock of 
	 * the same type" message. When accepting new L2CAP connection
	 * ng_btsocket_rfcomm_session_accept() holds both session mutexes 
	 * for "old" (accepting) session and "new" (created) session.
	 */

	mtx_init(&s->session_mtx, "btsocks_rfcomm_session_mtx", NULL,
		MTX_DEF|MTX_DUPOK);

	LIST_INIT(&s->dlcs);

	/* Prepare L2CAP socket */
	SOCKBUF_LOCK(&l2so->so_rcv);
	soupcall_set(l2so, SO_RCV, ng_btsocket_rfcomm_upcall, NULL);
	SOCKBUF_UNLOCK(&l2so->so_rcv);
	SOCKBUF_LOCK(&l2so->so_snd);
	soupcall_set(l2so, SO_SND, ng_btsocket_rfcomm_upcall, NULL);
	SOCKBUF_UNLOCK(&l2so->so_snd);
	l2so->so_state |= SS_NBIO;
	s->l2so = l2so;

	mtx_lock(&s->session_mtx);

	/*
	 * "src" == NULL and "dst" == NULL means just create session.
	 * caller must do the rest
	 */

	if (src == NULL && dst == NULL)
		goto done;

	/*
	 * Set incoming MTU on L2CAP socket. It is RFCOMM session default MTU 
	 * plus 5 bytes: RFCOMM frame header, one extra byte for length and one
	 * extra byte for credits.
	 */

	mtu = s->mtu + sizeof(struct rfcomm_frame_hdr) + 1 + 1;

	l2sopt.sopt_dir = SOPT_SET;
	l2sopt.sopt_level = SOL_L2CAP;
	l2sopt.sopt_name = SO_L2CAP_IMTU;
	l2sopt.sopt_val = (void *) &mtu;
	l2sopt.sopt_valsize = sizeof(mtu);
	l2sopt.sopt_td = NULL;

	error = sosetopt(s->l2so, &l2sopt);
	if (error != 0)
		goto bad;

	/* Bind socket to "src" address */
	l2sa.l2cap_len = sizeof(l2sa);
	l2sa.l2cap_family = AF_BLUETOOTH;
	/* A listening session binds to the RFCOMM PSM, a connecting one to 0 */
	l2sa.l2cap_psm = (dst == NULL)? htole16(NG_L2CAP_PSM_RFCOMM) : 0;
	bcopy(src, &l2sa.l2cap_bdaddr, sizeof(l2sa.l2cap_bdaddr));
	l2sa.l2cap_cid = 0;
	l2sa.l2cap_bdaddr_type = BDADDR_BREDR;

	error = sobind(s->l2so, (struct sockaddr *) &l2sa, td);
	if (error != 0)
		goto bad;

	/* If "dst" is not NULL then initiate connect(), otherwise listen() */
	if (dst == NULL) {
		s->flags = 0;
		s->state = NG_BTSOCKET_RFCOMM_SESSION_LISTENING;

		error = solisten(s->l2so, 10, td);
		if (error != 0)
			goto bad;
	} else {
		s->flags = NG_BTSOCKET_RFCOMM_SESSION_INITIATOR;
		s->state = NG_BTSOCKET_RFCOMM_SESSION_CONNECTING;

		l2sa.l2cap_len = sizeof(l2sa);   
		l2sa.l2cap_family = AF_BLUETOOTH;
		l2sa.l2cap_psm = htole16(NG_L2CAP_PSM_RFCOMM);
	        bcopy(dst, &l2sa.l2cap_bdaddr, sizeof(l2sa.l2cap_bdaddr));
		l2sa.l2cap_cid = 0;
		l2sa.l2cap_bdaddr_type = BDADDR_BREDR;

		error = soconnect(s->l2so, (struct sockaddr *) &l2sa, td);
		if (error != 0)
			goto bad;
	}

done:
	LIST_INSERT_HEAD(&ng_btsocket_rfcomm_sessions, s, next);
	*sp = s;

	mtx_unlock(&s->session_mtx);

	return (0);

bad:
	mtx_unlock(&s->session_mtx);

	/* Return L2CAP socket back to its original state */
	SOCKBUF_LOCK(&l2so->so_rcv);
	soupcall_clear(s->l2so, SO_RCV);
	SOCKBUF_UNLOCK(&l2so->so_rcv);
	SOCKBUF_LOCK(&l2so->so_snd);
	soupcall_clear(s->l2so, SO_SND);
	SOCKBUF_UNLOCK(&l2so->so_snd);
	l2so->so_state &= ~SS_NBIO;

	mtx_destroy(&s->session_mtx);
	bzero(s, sizeof(*s));
	free(s, M_NETGRAPH_BTSOCKET_RFCOMM);

	return (error);
} /* ng_btsocket_rfcomm_session_create */

/*
 * Process accept() on RFCOMM session
 * XXX FIXME locking for "l2so"?
*/

/*
 * Caller must hold ng_btsocket_rfcomm_sessions_mtx and s0->session_mtx.
 *
 * Dequeues one completed connection from the listening L2CAP socket of "s0"
 * and accepts it. If no RFCOMM session exists yet for the connection's
 * address pair, a new session is created in CONNECTED state and its MTU is
 * derived from the L2CAP MTUs. Otherwise the new L2CAP socket is closed and
 * EBUSY is returned, because only one RFCOMM session per pair of devices is
 * supported.
 *
 * Returns 0 when a session was created, otherwise an error (EWOULDBLOCK
 * when solisten_dequeue() reports that nothing is pending).
 */

static int
ng_btsocket_rfcomm_session_accept(ng_btsocket_rfcomm_session_p s0)
{
	struct socket			*l2so;
	struct sockaddr_l2cap		*l2sa = NULL;
	ng_btsocket_l2cap_pcb_t		*l2pcb = NULL;
	ng_btsocket_rfcomm_session_p	 s = NULL;
	int				 error;

	mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED);
	mtx_assert(&s0->session_mtx, MA_OWNED);

	/*
	 * NOTE(review): the listen lock is never released in this function;
	 * presumably solisten_dequeue() drops it on every path - confirm.
	 */
	SOLISTEN_LOCK(s0->l2so);
	error = solisten_dequeue(s0->l2so, &l2so, 0);
	if (error == EWOULDBLOCK)
		return (error);
	if (error) {
		NG_BTSOCKET_RFCOMM_ERR(
"%s: Could not accept connection on L2CAP socket, error=%d\n", __func__, error);
		return (error);
	}

	error = soaccept(l2so, (struct sockaddr **) &l2sa);

	/*
	 * The peer address handed back by soaccept() is not needed: the
	 * addresses are taken from the L2CAP pcb below. Release it right
	 * away so that it is not leaked on any of the return paths.
	 * free(9) accepts NULL, which covers a failed soaccept().
	 */
	free(l2sa, M_SONAME);
	l2sa = NULL;

	if (error != 0) {
		NG_BTSOCKET_RFCOMM_ERR(
"%s: soaccept() on L2CAP socket failed, error=%d\n", __func__, error);
		soclose(l2so);

		return (error);
	}

	/*
	 * Check if there is already active RFCOMM session between two devices.
	 * If so then close L2CAP connection. We only support one RFCOMM session
	 * between each pair of devices. Note that here we assume session in any
	 * state. The session even could be in the middle of disconnecting.
	 */

	l2pcb = so2l2cap_pcb(l2so);
	s = ng_btsocket_rfcomm_session_by_addr(&l2pcb->src, &l2pcb->dst);
	if (s == NULL) {
		/* Create a new RFCOMM session */
		error = ng_btsocket_rfcomm_session_create(&s, l2so, NULL, NULL,
				curthread /* XXX */);
		if (error == 0) {
			mtx_lock(&s->session_mtx);

			s->flags = 0;
			s->state = NG_BTSOCKET_RFCOMM_SESSION_CONNECTED;

			/*
			 * Adjust MTU on incoming connection. Reserve 5 bytes:
			 * RFCOMM frame header, one extra byte for length and 
			 * one extra byte for credits.
			 */

			s->mtu = min(l2pcb->imtu, l2pcb->omtu) -
					sizeof(struct rfcomm_frame_hdr) - 1 - 1;

			mtx_unlock(&s->session_mtx);
		} else {
			NG_BTSOCKET_RFCOMM_ALERT(
"%s: Failed to create new RFCOMM session, error=%d\n", __func__, error);

			/* session_create() does not own l2so on failure */
			soclose(l2so);
		}
	} else {
		NG_BTSOCKET_RFCOMM_WARN(
"%s: Rejecting duplicating RFCOMM session between src=%x:%x:%x:%x:%x:%x and " \
"dst=%x:%x:%x:%x:%x:%x, state=%d, flags=%#x\n",	__func__,
			l2pcb->src.b[5], l2pcb->src.b[4], l2pcb->src.b[3],
			l2pcb->src.b[2], l2pcb->src.b[1], l2pcb->src.b[0],
			l2pcb->dst.b[5], l2pcb->dst.b[4], l2pcb->dst.b[3],
			l2pcb->dst.b[2], l2pcb->dst.b[1], l2pcb->dst.b[0],
			s->state, s->flags);

		error = EBUSY;
		soclose(l2so);
	}

	return (error);
} /* ng_btsocket_rfcomm_session_accept */

/*
 * Process connect() on RFCOMM session
 * XXX FIXME locking for "l2so"?
 *
 * Caller must hold s->session_mtx.
 *
 * Returns the pending socket error (and clears it) if the L2CAP connect
 * failed, 0 while the connect is still in progress. Once the socket is
 * connected the session moves to CONNECTED, its MTU is derived from the
 * L2CAP MTUs, a SABM for DLCI 0 is queued and the RFCOMM task is woken up;
 * the result of those two steps is returned.
 */

static int
ng_btsocket_rfcomm_session_connect(ng_btsocket_rfcomm_session_p s)
{
	ng_btsocket_l2cap_pcb_p	l2pcb = so2l2cap_pcb(s->l2so);
	int			error;

	mtx_assert(&s->session_mtx, MA_OWNED);

	/* First check if connection has failed */
	if ((error = s->l2so->so_error) != 0) {
		s->l2so->so_error = 0;

		NG_BTSOCKET_RFCOMM_ERR(
"%s: Could not connect RFCOMM session, error=%d, state=%d, flags=%#x\n",
			__func__, error, s->state, s->flags);

		return (error);
	}

	/* Is connection still in progress? */
	if (s->l2so->so_state & SS_ISCONNECTING)
		return (0); 

	/* 
	 * If we got here then we are connected. Send SABM on DLCI 0 to 
	 * open multiplexor channel.
	 */

	if (error == 0) {
		s->state = NG_BTSOCKET_RFCOMM_SESSION_CONNECTED;

		/*
		 * Adjust MTU on outgoing connection. Reserve 5 bytes: RFCOMM 
		 * frame header, one extra byte for length and one extra byte 
		 * for credits.
		 */

		s->mtu = min(l2pcb->imtu, l2pcb->omtu) -
				sizeof(struct rfcomm_frame_hdr) - 1 - 1;

		error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_SABM,0);
		if (error == 0)
			error = ng_btsocket_rfcomm_task_wakeup();
	}

	return (error);
}/* ng_btsocket_rfcomm_session_connect */

/*
 * Receive data on RFCOMM session
 * XXX FIXME locking for "l2so"?
*/

/*
 * Caller must hold s->session_mtx.
 *
 * Drains the L2CAP socket without blocking and hands every packet to
 * ng_btsocket_rfcomm_receive_frame(), which takes ownership of the mbuf
 * chain. The result of frame processing is not propagated.
 *
 * Returns 0 when the socket is not readable, has nothing more to deliver
 * or would block; otherwise the pending socket error (which is cleared) or
 * the error returned by soreceive().
 */

static int
ng_btsocket_rfcomm_session_receive(ng_btsocket_rfcomm_session_p s)
{
	struct mbuf	*m = NULL;
	struct uio	 uio;
	int		 more, flags, error;

	mtx_assert(&s->session_mtx, MA_OWNED);

	/* Can we read from the L2CAP socket? */
	if (!soreadable(s->l2so))
		return (0);

	/* First check for error on L2CAP socket */
	if ((error = s->l2so->so_error) != 0) {
		s->l2so->so_error = 0;

		NG_BTSOCKET_RFCOMM_ERR(
"%s: Could not receive data from L2CAP socket, error=%d, state=%d, flags=%#x\n",
			__func__, error, s->state, s->flags);

		return (error);
	}

	/*
	 * Read all packets from the L2CAP socket. 
	 * XXX FIXME/VERIFY is that correct? For now use m->m_nextpkt as
	 * indication that there is more packets on the socket's buffer.
	 * Also what should we use in uio.uio_resid?
	 * May be s->mtu + sizeof(struct rfcomm_frame_hdr) + 1 + 1?
	 */

	for (more = 1; more; ) {
		/* Try to get next packet from socket */
		bzero(&uio, sizeof(uio));
/*		uio.uio_td = NULL; */
		uio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;

		m = NULL;
		error = soreceive(s->l2so, NULL, &uio, &m,
		    (struct mbuf **) NULL, &flags);
		if (error != 0) {
			if (error == EWOULDBLOCK)
				return (0); /* XXX can happen? */

			NG_BTSOCKET_RFCOMM_ERR(
"%s: Could not receive data from L2CAP socket, error=%d\n", __func__, error);

			return (error);
		}

		/*
		 * soreceive() can succeed without handing back any data,
		 * e.g. when the peer has shut the connection down and the
		 * receive buffer is empty. There is nothing to process in
		 * that case and "m" must not be dereferenced.
		 */
		if (m == NULL)
			return (0);

		more = (m->m_nextpkt != NULL);
		m->m_nextpkt = NULL;

		ng_btsocket_rfcomm_receive_frame(s, m);
	}

	return (0);
} /* ng_btsocket_rfcomm_session_receive */

/*
 * Send data on RFCOMM session
 * XXX FIXME locking for "l2so"?
*/

/*
 * Caller must hold s->session_mtx.
 *
 * Moves frames from the session output queue to the L2CAP socket for as
 * long as the socket is writeable. Returns 0 when the queue is empty or the
 * socket stops being writeable; otherwise the pending socket error (which
 * is cleared) or the error returned by the socket's pru_send().
 */

static int
ng_btsocket_rfcomm_session_send(ng_btsocket_rfcomm_session_p s)
{
	struct mbuf	*frame = NULL;
	int		 error;

	mtx_assert(&s->session_mtx, MA_OWNED);

	/* Send as much as we can from the session queue */
	for (;;) {
		if (!sowriteable(s->l2so))
			break;

		/* Check if socket still OK */
		error = s->l2so->so_error;
		if (error != 0) {
			s->l2so->so_error = 0;

			NG_BTSOCKET_RFCOMM_ERR(
"%s: Detected error=%d on L2CAP socket, state=%d, flags=%#x\n",
				__func__, error, s->state, s->flags);

			return (error);
		}

		/* Nothing left on the queue means we are done */
		NG_BT_MBUFQ_DEQUEUE(&s->outq, frame);
		if (frame == NULL)
			return (0);

		/* Call send function on the L2CAP socket */
		error = (*s->l2so->so_proto->pr_usrreqs->pru_send)(s->l2so,
				0, frame, NULL, NULL, curthread /* XXX */);
		if (error != 0) {
			NG_BTSOCKET_RFCOMM_ERR(
"%s: Could not send data to L2CAP socket, error=%d\n", __func__, error);

			return (error);
		}
	}

	return (0);
} /* ng_btsocket_rfcomm_session_send */

/*
 * Kill every DLC that is linked to the given session. Caller must hold
 * s->session_mtx. A DLC that was in CONNECTED state is killed with
 * ECONNRESET, any other DLC with ECONNREFUSED.
 */

static void
ng_btsocket_rfcomm_session_clean(ng_btsocket_rfcomm_session_p s)
{
	ng_btsocket_rfcomm_pcb_p	dlc = NULL, dlc_next = NULL;
	int				reason;

	mtx_assert(&s->session_mtx, MA_OWNED);

	/*
	 * ng_btsocket_rfcomm_pcb_kill() unlinks the DLC from the session, so
	 * the successor is fetched before the kill instead of relying on
	 * LIST_FOREACH.
	 */

	dlc = LIST_FIRST(&s->dlcs);
	while (dlc != NULL) {
		mtx_lock(&dlc->pcb_mtx);
		dlc_next = LIST_NEXT(dlc, session_next);

		NG_BTSOCKET_RFCOMM_INFO(
"%s: Disconnecting dlci=%d, state=%d, flags=%#x\n",
			__func__, dlc->dlci, dlc->state, dlc->flags);

		reason = (dlc->state == NG_BTSOCKET_RFCOMM_DLC_CONNECTED)?
				ECONNRESET : ECONNREFUSED;

		ng_btsocket_rfcomm_pcb_kill(dlc, reason);

		mtx_unlock(&dlc->pcb_mtx);
		dlc = dlc_next;
	}
} /* ng_btsocket_rfcomm_session_clean */

/*
 * Process all DLCs on the session. Caller MUST hold s->session_mtx.
*/ static void ng_btsocket_rfcomm_session_process_pcb(ng_btsocket_rfcomm_session_p s) { ng_btsocket_rfcomm_pcb_p pcb = NULL, pcb_next = NULL; int error; mtx_assert(&s->session_mtx, MA_OWNED); /* * Note: cannot use LIST_FOREACH because ng_btsocket_rfcomm_pcb_kill * will unlink DLC from the session */ for (pcb = LIST_FIRST(&s->dlcs); pcb != NULL; ) { mtx_lock(&pcb->pcb_mtx); pcb_next = LIST_NEXT(pcb, session_next); switch (pcb->state) { /* * If DLC in W4_CONNECT state then we should check for both * timeout and detach. */ case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT: if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_DETACHED) ng_btsocket_rfcomm_pcb_kill(pcb, 0); else if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT) ng_btsocket_rfcomm_pcb_kill(pcb, ETIMEDOUT); break; /* * If DLC in CONFIGURING or CONNECTING state then we only * should check for timeout. If detach() was called then * DLC will be moved into DISCONNECTING state. */ case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING: case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT) ng_btsocket_rfcomm_pcb_kill(pcb, ETIMEDOUT); break; /* * If DLC in CONNECTED state then we need to send data (if any) * from the socket's send queue. Note that we will send data * from either all sockets or none. This may overload session's * outgoing queue (but we do not check for that). * * XXX FIXME need scheduler for RFCOMM sockets */ case NG_BTSOCKET_RFCOMM_DLC_CONNECTED: error = ng_btsocket_rfcomm_pcb_send(pcb, ALOT); if (error != 0) ng_btsocket_rfcomm_pcb_kill(pcb, error); break; /* * If DLC in DISCONNECTING state then we must send DISC frame. * Note that if DLC has timeout set then we do not need to * resend DISC frame. 
* * XXX FIXME need to drain all data from the socket's queue * if LINGER option was set */ case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING: if (!(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO)) { error = ng_btsocket_rfcomm_send_command( pcb->session, RFCOMM_FRAME_DISC, pcb->dlci); if (error == 0) ng_btsocket_rfcomm_timeout(pcb); else ng_btsocket_rfcomm_pcb_kill(pcb, error); } else if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT) ng_btsocket_rfcomm_pcb_kill(pcb, ETIMEDOUT); break; /* case NG_BTSOCKET_RFCOMM_DLC_CLOSED: */ default: panic("%s: Invalid DLC state=%d, flags=%#x\n", __func__, pcb->state, pcb->flags); break; } mtx_unlock(&pcb->pcb_mtx); pcb = pcb_next; } } /* ng_btsocket_rfcomm_session_process_pcb */ /* * Find RFCOMM session between "src" and "dst". * Caller MUST hold ng_btsocket_rfcomm_sessions_mtx. */ static ng_btsocket_rfcomm_session_p ng_btsocket_rfcomm_session_by_addr(bdaddr_p src, bdaddr_p dst) { ng_btsocket_rfcomm_session_p s = NULL; ng_btsocket_l2cap_pcb_p l2pcb = NULL; int any_src; mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED); any_src = (bcmp(src, NG_HCI_BDADDR_ANY, sizeof(*src)) == 0); LIST_FOREACH(s, &ng_btsocket_rfcomm_sessions, next) { l2pcb = so2l2cap_pcb(s->l2so); if ((any_src || bcmp(&l2pcb->src, src, sizeof(*src)) == 0) && bcmp(&l2pcb->dst, dst, sizeof(*dst)) == 0) break; } return (s); } /* ng_btsocket_rfcomm_session_by_addr */ /***************************************************************************** ***************************************************************************** ** RFCOMM ***************************************************************************** *****************************************************************************/ /* * Process incoming RFCOMM frame. Caller must hold s->session_mtx. 
* XXX FIXME check frame length */ static int ng_btsocket_rfcomm_receive_frame(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_frame_hdr *hdr = NULL; struct mbuf *m = NULL; u_int16_t length; u_int8_t dlci, type; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); /* Pullup as much as we can into first mbuf (for direct access) */ length = min(m0->m_pkthdr.len, MHLEN); if (m0->m_len < length) { if ((m0 = m_pullup(m0, length)) == NULL) { NG_BTSOCKET_RFCOMM_ALERT( "%s: m_pullup(%d) failed\n", __func__, length); return (ENOBUFS); } } hdr = mtod(m0, struct rfcomm_frame_hdr *); dlci = RFCOMM_DLCI(hdr->address); type = RFCOMM_TYPE(hdr->control); /* Test EA bit in length. If not set then we have 2 bytes of length */ if (!RFCOMM_EA(hdr->length)) { bcopy(&hdr->length, &length, sizeof(length)); length = le16toh(length) >> 1; m_adj(m0, sizeof(*hdr) + 1); } else { length = hdr->length >> 1; m_adj(m0, sizeof(*hdr)); } NG_BTSOCKET_RFCOMM_INFO( "%s: Got frame type=%#x, dlci=%d, length=%d, cr=%d, pf=%d, len=%d\n", __func__, type, dlci, length, RFCOMM_CR(hdr->address), RFCOMM_PF(hdr->control), m0->m_pkthdr.len); /* * Get FCS (the last byte in the frame) * XXX this will not work if mbuf chain ends with empty mbuf. * XXX let's hope it never happens :) */ for (m = m0; m->m_next != NULL; m = m->m_next) ; if (m->m_len <= 0) panic("%s: Empty mbuf at the end of the chain, len=%d\n", __func__, m->m_len); /* * Check FCS. We only need to calculate FCS on first 2 or 3 bytes * and already m_pullup'ed mbuf chain, so it should be safe. */ if (ng_btsocket_rfcomm_check_fcs((u_int8_t *) hdr, type, m->m_data[m->m_len - 1])) { NG_BTSOCKET_RFCOMM_ERR( "%s: Invalid RFCOMM packet. Bad checksum\n", __func__); NG_FREE_M(m0); return (EINVAL); } m_adj(m0, -1); /* Trim FCS byte */ /* * Process RFCOMM frame. * * From TS 07.10 spec * * "... In the case where a SABM or DISC command with the P bit set * to 0 is received then the received frame shall be discarded..." * * "... 
If a unsolicited DM response is received then the frame shall * be processed irrespective of the P/F setting... " * * "... The station may transmit response frames with the F bit set * to 0 at any opportunity on an asynchronous basis. However, in the * case where a UA response is received with the F bit set to 0 then * the received frame shall be discarded..." * * From Bluetooth spec * * "... When credit based flow control is being used, the meaning of * the P/F bit in the control field of the RFCOMM header is redefined * for UIH frames..." */ switch (type) { case RFCOMM_FRAME_SABM: if (RFCOMM_PF(hdr->control)) error = ng_btsocket_rfcomm_receive_sabm(s, dlci); break; case RFCOMM_FRAME_DISC: if (RFCOMM_PF(hdr->control)) error = ng_btsocket_rfcomm_receive_disc(s, dlci); break; case RFCOMM_FRAME_UA: if (RFCOMM_PF(hdr->control)) error = ng_btsocket_rfcomm_receive_ua(s, dlci); break; case RFCOMM_FRAME_DM: error = ng_btsocket_rfcomm_receive_dm(s, dlci); break; case RFCOMM_FRAME_UIH: if (dlci == 0) error = ng_btsocket_rfcomm_receive_mcc(s, m0); else error = ng_btsocket_rfcomm_receive_uih(s, dlci, RFCOMM_PF(hdr->control), m0); return (error); /* NOT REACHED */ default: NG_BTSOCKET_RFCOMM_ERR( "%s: Invalid RFCOMM packet. 
Unknown type=%#x\n", __func__, type); error = EINVAL; break; } NG_FREE_M(m0); return (error); } /* ng_btsocket_rfcomm_receive_frame */ /* * Process RFCOMM SABM frame */ static int ng_btsocket_rfcomm_receive_sabm(ng_btsocket_rfcomm_session_p s, int dlci) { ng_btsocket_rfcomm_pcb_p pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got SABM, session state=%d, flags=%#x, mtu=%d, dlci=%d\n", __func__, s->state, s->flags, s->mtu, dlci); /* DLCI == 0 means open multiplexor channel */ if (dlci == 0) { switch (s->state) { case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED: case NG_BTSOCKET_RFCOMM_SESSION_OPEN: error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_UA, dlci); if (error == 0) { s->state = NG_BTSOCKET_RFCOMM_SESSION_OPEN; ng_btsocket_rfcomm_connect_cfm(s); } else { s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; ng_btsocket_rfcomm_session_clean(s); } break; default: NG_BTSOCKET_RFCOMM_WARN( "%s: Got SABM for session in invalid state state=%d, flags=%#x\n", __func__, s->state, s->flags); error = EINVAL; break; } return (error); } /* Make sure multiplexor channel is open */ if (s->state != NG_BTSOCKET_RFCOMM_SESSION_OPEN) { NG_BTSOCKET_RFCOMM_ERR( "%s: Got SABM for dlci=%d with mulitplexor channel closed, state=%d, " \ "flags=%#x\n", __func__, dlci, s->state, s->flags); return (EINVAL); } /* * Check if we have this DLCI. This might happen when remote * peer uses PN command before actual open (SABM) happens. 
*/ pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTING) { NG_BTSOCKET_RFCOMM_ERR( "%s: Got SABM for dlci=%d in invalid state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); mtx_unlock(&pcb->pcb_mtx); return (ENOENT); } ng_btsocket_rfcomm_untimeout(pcb); error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_UA,dlci); if (error == 0) error = ng_btsocket_rfcomm_send_msc(pcb); if (error == 0) { pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTED; soisconnected(pcb->so); } else ng_btsocket_rfcomm_pcb_kill(pcb, error); mtx_unlock(&pcb->pcb_mtx); return (error); } /* * We do not have requested DLCI, so it must be an incoming connection * with default parameters. Try to accept it. */ pcb = ng_btsocket_rfcomm_connect_ind(s, RFCOMM_SRVCHANNEL(dlci)); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); pcb->dlci = dlci; error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_UA,dlci); if (error == 0) error = ng_btsocket_rfcomm_send_msc(pcb); if (error == 0) { pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTED; soisconnected(pcb->so); } else ng_btsocket_rfcomm_pcb_kill(pcb, error); mtx_unlock(&pcb->pcb_mtx); } else /* Nobody is listen()ing on the requested DLCI */ error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_DM,dlci); return (error); } /* ng_btsocket_rfcomm_receive_sabm */ /* * Process RFCOMM DISC frame */ static int ng_btsocket_rfcomm_receive_disc(ng_btsocket_rfcomm_session_p s, int dlci) { ng_btsocket_rfcomm_pcb_p pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got DISC, session state=%d, flags=%#x, mtu=%d, dlci=%d\n", __func__, s->state, s->flags, s->mtu, dlci); /* DLCI == 0 means close multiplexor channel */ if (dlci == 0) { /* XXX FIXME assume that remote side will close the socket */ error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_UA, 0); if (error == 0) { if (s->state == NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING) s->state = 
NG_BTSOCKET_RFCOMM_SESSION_CLOSED; /* XXX */ else s->state = NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING; } else s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; /* XXX */ ng_btsocket_rfcomm_session_clean(s); } else { pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci); if (pcb != NULL) { int err; mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_RFCOMM_INFO( "%s: Got DISC for dlci=%d, state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_UA, dlci); if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONNECTED) err = 0; else err = ECONNREFUSED; ng_btsocket_rfcomm_pcb_kill(pcb, err); mtx_unlock(&pcb->pcb_mtx); } else { NG_BTSOCKET_RFCOMM_WARN( "%s: Got DISC for non-existing dlci=%d\n", __func__, dlci); error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_DM, dlci); } } return (error); } /* ng_btsocket_rfcomm_receive_disc */ /* * Process RFCOMM UA frame */ static int ng_btsocket_rfcomm_receive_ua(ng_btsocket_rfcomm_session_p s, int dlci) { ng_btsocket_rfcomm_pcb_p pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got UA, session state=%d, flags=%#x, mtu=%d, dlci=%d\n", __func__, s->state, s->flags, s->mtu, dlci); /* dlci == 0 means multiplexor channel */ if (dlci == 0) { switch (s->state) { case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED: s->state = NG_BTSOCKET_RFCOMM_SESSION_OPEN; ng_btsocket_rfcomm_connect_cfm(s); break; case NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING: s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; ng_btsocket_rfcomm_session_clean(s); break; default: NG_BTSOCKET_RFCOMM_WARN( "%s: Got UA for session in invalid state=%d(%d), flags=%#x, mtu=%d\n", __func__, s->state, INITIATOR(s), s->flags, s->mtu); error = ENOENT; break; } return (error); } /* Check if we have this DLCI */ pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_RFCOMM_INFO( "%s: Got UA for dlci=%d, state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); 
switch (pcb->state) { case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: ng_btsocket_rfcomm_untimeout(pcb); error = ng_btsocket_rfcomm_send_msc(pcb); if (error == 0) { pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTED; soisconnected(pcb->so); } break; case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING: ng_btsocket_rfcomm_pcb_kill(pcb, 0); break; default: NG_BTSOCKET_RFCOMM_WARN( "%s: Got UA for dlci=%d in invalid state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); error = ENOENT; break; } mtx_unlock(&pcb->pcb_mtx); } else { NG_BTSOCKET_RFCOMM_WARN( "%s: Got UA for non-existing dlci=%d\n", __func__, dlci); error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_DM,dlci); } return (error); } /* ng_btsocket_rfcomm_receive_ua */ /* * Process RFCOMM DM frame */ static int ng_btsocket_rfcomm_receive_dm(ng_btsocket_rfcomm_session_p s, int dlci) { ng_btsocket_rfcomm_pcb_p pcb = NULL; int error; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got DM, session state=%d, flags=%#x, mtu=%d, dlci=%d\n", __func__, s->state, s->flags, s->mtu, dlci); /* DLCI == 0 means multiplexor channel */ if (dlci == 0) { /* Disconnect all dlc's on the session */ s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; ng_btsocket_rfcomm_session_clean(s); } else { pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); NG_BTSOCKET_RFCOMM_INFO( "%s: Got DM for dlci=%d, state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONNECTED) error = ECONNRESET; else error = ECONNREFUSED; ng_btsocket_rfcomm_pcb_kill(pcb, error); mtx_unlock(&pcb->pcb_mtx); } else NG_BTSOCKET_RFCOMM_WARN( "%s: Got DM for non-existing dlci=%d\n", __func__, dlci); } return (0); } /* ng_btsocket_rfcomm_receive_dm */ /* * Process RFCOMM UIH frame (data) */ static int ng_btsocket_rfcomm_receive_uih(ng_btsocket_rfcomm_session_p s, int dlci, int pf, struct mbuf *m0) { ng_btsocket_rfcomm_pcb_p pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, 
MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got UIH, session state=%d, flags=%#x, mtu=%d, dlci=%d, pf=%d, len=%d\n", __func__, s->state, s->flags, s->mtu, dlci, pf, m0->m_pkthdr.len); /* XXX should we do it here? Check for session flow control */ if (s->flags & NG_BTSOCKET_RFCOMM_SESSION_LFC) { NG_BTSOCKET_RFCOMM_WARN( "%s: Got UIH with session flow control asserted, state=%d, flags=%#x\n", __func__, s->state, s->flags); goto drop; } /* Check if we have this dlci */ pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci); if (pcb == NULL) { NG_BTSOCKET_RFCOMM_WARN( "%s: Got UIH for non-existing dlci=%d\n", __func__, dlci); error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_DM,dlci); goto drop; } mtx_lock(&pcb->pcb_mtx); /* Check dlci state */ if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTED) { NG_BTSOCKET_RFCOMM_WARN( "%s: Got UIH for dlci=%d in invalid state=%d, flags=%#x\n", __func__, dlci, pcb->state, pcb->flags); error = EINVAL; goto drop1; } /* Check dlci flow control */ if (((pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) && pcb->rx_cred <= 0) || (pcb->lmodem & RFCOMM_MODEM_FC)) { NG_BTSOCKET_RFCOMM_ERR( "%s: Got UIH for dlci=%d with asserted flow control, state=%d, " \ "flags=%#x, rx_cred=%d, lmodem=%#x\n", __func__, dlci, pcb->state, pcb->flags, pcb->rx_cred, pcb->lmodem); goto drop1; } /* Did we get any credits? */ if ((pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) && pf) { NG_BTSOCKET_RFCOMM_INFO( "%s: Got %d more credits for dlci=%d, state=%d, flags=%#x, " \ "rx_cred=%d, tx_cred=%d\n", __func__, *mtod(m0, u_int8_t *), dlci, pcb->state, pcb->flags, pcb->rx_cred, pcb->tx_cred); pcb->tx_cred += *mtod(m0, u_int8_t *); m_adj(m0, 1); /* Send more from the DLC. XXX check for errors? 
*/ ng_btsocket_rfcomm_pcb_send(pcb, ALOT); } /* OK the of the rest of the mbuf is the data */ if (m0->m_pkthdr.len > 0) { /* If we are using credit flow control decrease rx_cred here */ if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) { /* Give remote peer more credits (if needed) */ if (-- pcb->rx_cred <= RFCOMM_MAX_CREDITS / 2) ng_btsocket_rfcomm_send_credits(pcb); else NG_BTSOCKET_RFCOMM_INFO( "%s: Remote side still has credits, dlci=%d, state=%d, flags=%#x, " \ "rx_cred=%d, tx_cred=%d\n", __func__, dlci, pcb->state, pcb->flags, pcb->rx_cred, pcb->tx_cred); } /* Check packet against mtu on dlci */ if (m0->m_pkthdr.len > pcb->mtu) { NG_BTSOCKET_RFCOMM_ERR( "%s: Got oversized UIH for dlci=%d, state=%d, flags=%#x, mtu=%d, len=%d\n", __func__, dlci, pcb->state, pcb->flags, pcb->mtu, m0->m_pkthdr.len); error = EMSGSIZE; } else if (m0->m_pkthdr.len > sbspace(&pcb->so->so_rcv)) { /* * This is really bad. Receive queue on socket does * not have enough space for the packet. We do not * have any other choice but drop the packet. */ NG_BTSOCKET_RFCOMM_ERR( "%s: Not enough space in socket receive queue. Dropping UIH for dlci=%d, " \ "state=%d, flags=%#x, len=%d, space=%ld\n", __func__, dlci, pcb->state, pcb->flags, m0->m_pkthdr.len, sbspace(&pcb->so->so_rcv)); error = ENOBUFS; } else { /* Append packet to the socket receive queue */ sbappend(&pcb->so->so_rcv, m0, 0); m0 = NULL; sorwakeup(pcb->so); } } drop1: mtx_unlock(&pcb->pcb_mtx); drop: NG_FREE_M(m0); /* checks for != NULL */ return (error); } /* ng_btsocket_rfcomm_receive_uih */ /* * Process RFCOMM MCC command (Multiplexor) * * From TS 07.10 spec * * "5.4.3.1 Information Data * * ...The frames (UIH) sent by the initiating station have the C/R bit set * to 1 and those sent by the responding station have the C/R bit set to 0..." * * "5.4.6.2 Operating procedures * * Messages always exist in pairs; a command message and a corresponding * response message. 
If the C/R bit is set to 1 the message is a command, * if it is set to 0 the message is a response... * * ... * * NOTE: Notice that when UIH frames are used to convey information on DLCI 0 * there are at least two different fields that contain a C/R bit, and the * bits are set of different form. The C/R bit in the Type field shall be set * as it is stated above, while the C/R bit in the Address field (see subclause * 5.2.1.2) shall be set as it is described in subclause 5.4.3.1." */ static int ng_btsocket_rfcomm_receive_mcc(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = NULL; u_int8_t cr, type, length; mtx_assert(&s->session_mtx, MA_OWNED); /* * We can access data directly in the first mbuf, because we have * m_pullup()'ed mbuf chain in ng_btsocket_rfcomm_receive_frame(). * All MCC commands should fit into single mbuf (except probably TEST). */ hdr = mtod(m0, struct rfcomm_mcc_hdr *); cr = RFCOMM_CR(hdr->type); type = RFCOMM_MCC_TYPE(hdr->type); length = RFCOMM_MCC_LENGTH(hdr->length); /* Check MCC frame length */ if (sizeof(*hdr) + length != m0->m_pkthdr.len) { NG_BTSOCKET_RFCOMM_ERR( "%s: Invalid MCC frame length=%d, len=%d\n", __func__, length, m0->m_pkthdr.len); NG_FREE_M(m0); return (EMSGSIZE); } switch (type) { case RFCOMM_MCC_TEST: return (ng_btsocket_rfcomm_receive_test(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_FCON: case RFCOMM_MCC_FCOFF: return (ng_btsocket_rfcomm_receive_fc(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_MSC: return (ng_btsocket_rfcomm_receive_msc(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_RPN: return (ng_btsocket_rfcomm_receive_rpn(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_RLS: return (ng_btsocket_rfcomm_receive_rls(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_PN: return (ng_btsocket_rfcomm_receive_pn(s, m0)); /* NOT REACHED */ case RFCOMM_MCC_NSC: NG_BTSOCKET_RFCOMM_ERR( "%s: Got MCC NSC, type=%#x, cr=%d, length=%d, session state=%d, flags=%#x, " \ "mtu=%d, len=%d\n", __func__, RFCOMM_MCC_TYPE(*((u_int8_t 
*)(hdr + 1))), cr, length, s->state, s->flags, s->mtu, m0->m_pkthdr.len); NG_FREE_M(m0); break; default: NG_BTSOCKET_RFCOMM_ERR( "%s: Got unknown MCC, type=%#x, cr=%d, length=%d, session state=%d, " \ "flags=%#x, mtu=%d, len=%d\n", __func__, type, cr, length, s->state, s->flags, s->mtu, m0->m_pkthdr.len); /* Reuse mbuf to send NSC */ hdr = mtod(m0, struct rfcomm_mcc_hdr *); m0->m_pkthdr.len = m0->m_len = sizeof(*hdr); /* Create MCC NSC header */ hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_NSC); hdr->length = RFCOMM_MKLEN8(1); /* Put back MCC command type we did not like */ m0->m_data[m0->m_len] = RFCOMM_MKMCC_TYPE(cr, type); m0->m_pkthdr.len ++; m0->m_len ++; /* Send UIH frame */ return (ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0)); /* NOT REACHED */ } return (0); } /* ng_btsocket_rfcomm_receive_mcc */ /* * Receive RFCOMM TEST MCC command */ static int ng_btsocket_rfcomm_receive_test(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr *); int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC TEST, cr=%d, length=%d, session state=%d, flags=%#x, mtu=%d, " \ "len=%d\n", __func__, RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (RFCOMM_CR(hdr->type)) { hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_TEST); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); } else NG_FREE_M(m0); /* XXX ignore response */ return (error); } /* ng_btsocket_rfcomm_receive_test */ /* * Receive RFCOMM FCON/FCOFF MCC command */ static int ng_btsocket_rfcomm_receive_fc(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr *); u_int8_t type = RFCOMM_MCC_TYPE(hdr->type); int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); /* * Turn ON/OFF aggregate flow on the entire session. 
When remote peer * asserted flow control no transmission shall occur except on dlci 0 * (control channel). */ NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC FC%s, cr=%d, length=%d, session state=%d, flags=%#x, mtu=%d, " \ "len=%d\n", __func__, (type == RFCOMM_MCC_FCON)? "ON" : "OFF", RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (RFCOMM_CR(hdr->type)) { if (type == RFCOMM_MCC_FCON) s->flags &= ~NG_BTSOCKET_RFCOMM_SESSION_RFC; else s->flags |= NG_BTSOCKET_RFCOMM_SESSION_RFC; hdr->type = RFCOMM_MKMCC_TYPE(0, type); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); } else NG_FREE_M(m0); /* XXX ignore response */ return (error); } /* ng_btsocket_rfcomm_receive_fc */ /* * Receive RFCOMM MSC MCC command */ static int ng_btsocket_rfcomm_receive_msc(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr*); struct rfcomm_mcc_msc *msc = (struct rfcomm_mcc_msc *)(hdr+1); ng_btsocket_rfcomm_pcb_t *pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC MSC, dlci=%d, cr=%d, length=%d, session state=%d, flags=%#x, " \ "mtu=%d, len=%d\n", __func__, RFCOMM_DLCI(msc->address), RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (RFCOMM_CR(hdr->type)) { pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, RFCOMM_DLCI(msc->address)); if (pcb == NULL) { NG_BTSOCKET_RFCOMM_WARN( "%s: Got MSC command for non-existing dlci=%d\n", __func__, RFCOMM_DLCI(msc->address)); NG_FREE_M(m0); return (ENOENT); } mtx_lock(&pcb->pcb_mtx); if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTING && pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTED) { NG_BTSOCKET_RFCOMM_WARN( "%s: Got MSC on dlci=%d in invalid state=%d\n", __func__, RFCOMM_DLCI(msc->address), pcb->state); mtx_unlock(&pcb->pcb_mtx); NG_FREE_M(m0); return (EINVAL); } pcb->rmodem = msc->modem; /* Update remote port signals 
*/ hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_MSC); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); #if 0 /* YYY */ /* Send more data from DLC. XXX check for errors? */ if (!(pcb->rmodem & RFCOMM_MODEM_FC) && !(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC)) ng_btsocket_rfcomm_pcb_send(pcb, ALOT); #endif /* YYY */ mtx_unlock(&pcb->pcb_mtx); } else NG_FREE_M(m0); /* XXX ignore response */ return (error); } /* ng_btsocket_rfcomm_receive_msc */ /* * Receive RFCOMM RPN MCC command * XXX FIXME do we need htole16/le16toh for RPN param_mask? */ static int ng_btsocket_rfcomm_receive_rpn(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr *); struct rfcomm_mcc_rpn *rpn = (struct rfcomm_mcc_rpn *)(hdr + 1); int error = 0; u_int16_t param_mask; u_int8_t bit_rate, data_bits, stop_bits, parity, flow_control, xon_char, xoff_char; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC RPN, dlci=%d, cr=%d, length=%d, session state=%d, flags=%#x, " \ "mtu=%d, len=%d\n", __func__, RFCOMM_DLCI(rpn->dlci), RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (RFCOMM_CR(hdr->type)) { param_mask = RFCOMM_RPN_PM_ALL; if (RFCOMM_MCC_LENGTH(hdr->length) == 1) { /* Request - return default setting */ bit_rate = RFCOMM_RPN_BR_115200; data_bits = RFCOMM_RPN_DATA_8; stop_bits = RFCOMM_RPN_STOP_1; parity = RFCOMM_RPN_PARITY_NONE; flow_control = RFCOMM_RPN_FLOW_NONE; xon_char = RFCOMM_RPN_XON_CHAR; xoff_char = RFCOMM_RPN_XOFF_CHAR; } else { /* * Ignore/accept bit_rate, 8 bits, 1 stop bit, no * parity, no flow control lines, default XON/XOFF * chars. 
*/ bit_rate = rpn->bit_rate; rpn->param_mask = le16toh(rpn->param_mask); /* XXX */ data_bits = RFCOMM_RPN_DATA_BITS(rpn->line_settings); if (rpn->param_mask & RFCOMM_RPN_PM_DATA && data_bits != RFCOMM_RPN_DATA_8) { data_bits = RFCOMM_RPN_DATA_8; param_mask ^= RFCOMM_RPN_PM_DATA; } stop_bits = RFCOMM_RPN_STOP_BITS(rpn->line_settings); if (rpn->param_mask & RFCOMM_RPN_PM_STOP && stop_bits != RFCOMM_RPN_STOP_1) { stop_bits = RFCOMM_RPN_STOP_1; param_mask ^= RFCOMM_RPN_PM_STOP; } parity = RFCOMM_RPN_PARITY(rpn->line_settings); if (rpn->param_mask & RFCOMM_RPN_PM_PARITY && parity != RFCOMM_RPN_PARITY_NONE) { parity = RFCOMM_RPN_PARITY_NONE; param_mask ^= RFCOMM_RPN_PM_PARITY; } flow_control = rpn->flow_control; if (rpn->param_mask & RFCOMM_RPN_PM_FLOW && flow_control != RFCOMM_RPN_FLOW_NONE) { flow_control = RFCOMM_RPN_FLOW_NONE; param_mask ^= RFCOMM_RPN_PM_FLOW; } xon_char = rpn->xon_char; if (rpn->param_mask & RFCOMM_RPN_PM_XON && xon_char != RFCOMM_RPN_XON_CHAR) { xon_char = RFCOMM_RPN_XON_CHAR; param_mask ^= RFCOMM_RPN_PM_XON; } xoff_char = rpn->xoff_char; if (rpn->param_mask & RFCOMM_RPN_PM_XOFF && xoff_char != RFCOMM_RPN_XOFF_CHAR) { xoff_char = RFCOMM_RPN_XOFF_CHAR; param_mask ^= RFCOMM_RPN_PM_XOFF; } } rpn->bit_rate = bit_rate; rpn->line_settings = RFCOMM_MKRPN_LINE_SETTINGS(data_bits, stop_bits, parity); rpn->flow_control = flow_control; rpn->xon_char = xon_char; rpn->xoff_char = xoff_char; rpn->param_mask = htole16(param_mask); /* XXX */ m0->m_pkthdr.len = m0->m_len = sizeof(*hdr) + sizeof(*rpn); hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_RPN); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); } else NG_FREE_M(m0); /* XXX ignore response */ return (error); } /* ng_btsocket_rfcomm_receive_rpn */ /* * Receive RFCOMM RLS MCC command */ static int ng_btsocket_rfcomm_receive_rls(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr *); struct rfcomm_mcc_rls *rls = (struct 
rfcomm_mcc_rls *)(hdr + 1); int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); /* * XXX FIXME Do we have to do anything else here? Remote peer tries to * tell us something about DLCI. Just report what we have received and * return back received values as required by TS 07.10 spec. */ NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC RLS, dlci=%d, status=%#x, cr=%d, length=%d, session state=%d, " \ "flags=%#x, mtu=%d, len=%d\n", __func__, RFCOMM_DLCI(rls->address), rls->status, RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (RFCOMM_CR(hdr->type)) { if (rls->status & 0x1) NG_BTSOCKET_RFCOMM_ERR( "%s: Got RLS dlci=%d, error=%#x\n", __func__, RFCOMM_DLCI(rls->address), rls->status >> 1); hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_RLS); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); } else NG_FREE_M(m0); /* XXX ignore responses */ return (error); } /* ng_btsocket_rfcomm_receive_rls */ /* * Receive RFCOMM PN MCC command */ static int ng_btsocket_rfcomm_receive_pn(ng_btsocket_rfcomm_session_p s, struct mbuf *m0) { struct rfcomm_mcc_hdr *hdr = mtod(m0, struct rfcomm_mcc_hdr*); struct rfcomm_mcc_pn *pn = (struct rfcomm_mcc_pn *)(hdr+1); ng_btsocket_rfcomm_pcb_t *pcb = NULL; int error = 0; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Got MCC PN, dlci=%d, cr=%d, length=%d, flow_control=%#x, priority=%d, " \ "ack_timer=%d, mtu=%d, max_retrans=%d, credits=%d, session state=%d, " \ "flags=%#x, session mtu=%d, len=%d\n", __func__, pn->dlci, RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length), pn->flow_control, pn->priority, pn->ack_timer, le16toh(pn->mtu), pn->max_retrans, pn->credits, s->state, s->flags, s->mtu, m0->m_pkthdr.len); if (pn->dlci == 0) { NG_BTSOCKET_RFCOMM_ERR("%s: Zero dlci in MCC PN\n", __func__); NG_FREE_M(m0); return (EINVAL); } /* Check if we have this dlci */ pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, pn->dlci); if (pcb != NULL) { 
mtx_lock(&pcb->pcb_mtx); if (RFCOMM_CR(hdr->type)) { /* PN Request */ ng_btsocket_rfcomm_set_pn(pcb, 1, pn->flow_control, pn->credits, pn->mtu); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) { pn->flow_control = 0xe0; pn->credits = RFCOMM_DEFAULT_CREDITS; } else { pn->flow_control = 0; pn->credits = 0; } hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_PN); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); } else { /* PN Response - proceed with SABM. Timeout still set */ if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONFIGURING) { ng_btsocket_rfcomm_set_pn(pcb, 0, pn->flow_control, pn->credits, pn->mtu); pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTING; error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_SABM, pn->dlci); } else NG_BTSOCKET_RFCOMM_WARN( "%s: Got PN response for dlci=%d in invalid state=%d\n", __func__, pn->dlci, pcb->state); NG_FREE_M(m0); } mtx_unlock(&pcb->pcb_mtx); } else if (RFCOMM_CR(hdr->type)) { /* PN request to non-existing dlci - incoming connection */ pcb = ng_btsocket_rfcomm_connect_ind(s, RFCOMM_SRVCHANNEL(pn->dlci)); if (pcb != NULL) { mtx_lock(&pcb->pcb_mtx); pcb->dlci = pn->dlci; ng_btsocket_rfcomm_set_pn(pcb, 1, pn->flow_control, pn->credits, pn->mtu); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) { pn->flow_control = 0xe0; pn->credits = RFCOMM_DEFAULT_CREDITS; } else { pn->flow_control = 0; pn->credits = 0; } hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_PN); error = ng_btsocket_rfcomm_send_uih(s, RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0); if (error == 0) { ng_btsocket_rfcomm_timeout(pcb); pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTING; soisconnecting(pcb->so); } else ng_btsocket_rfcomm_pcb_kill(pcb, error); mtx_unlock(&pcb->pcb_mtx); } else { /* Nobody is listen()ing on this channel */ error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_DM, pn->dlci); NG_FREE_M(m0); } } else NG_FREE_M(m0); /* XXX ignore response to non-existing dlci */ return (error); } /* ng_btsocket_rfcomm_receive_pn */ /* * Set PN 
parameters for dlci. Caller must hold pcb->pcb_mtx. * * From Bluetooth spec. * * "... The CL1 - CL4 field is completely redefined. (In TS07.10 this defines * the convergence layer to use, which is not applicable to RFCOMM. In RFCOMM, * in Bluetooth versions up to 1.0B, this field was forced to 0). * * In the PN request sent prior to a DLC establishment, this field must contain * the value 15 (0xF), indicating support of credit based flow control in the * sender. See Table 5.3 below. If the PN response contains any other value * than 14 (0xE) in this field, it is inferred that the peer RFCOMM entity is * not supporting the credit based flow control feature. (This is only possible * if the peer RFCOMM implementation is only conforming to Bluetooth version * 1.0B.) If a PN request is sent on an already open DLC, then this field must * contain the value zero; it is not possible to set initial credits more * than once per DLC activation. A responding implementation must set this * field in the PN response to 14 (0xE), if (and only if) the value in the PN * request was 15..." */ static void ng_btsocket_rfcomm_set_pn(ng_btsocket_rfcomm_pcb_p pcb, u_int8_t cr, u_int8_t flow_control, u_int8_t credits, u_int16_t mtu) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); pcb->mtu = le16toh(mtu); if (cr) { if (flow_control == 0xf0) { pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_CFC; pcb->tx_cred = credits; } else { pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_CFC; pcb->tx_cred = 0; } } else { if (flow_control == 0xe0) { pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_CFC; pcb->tx_cred = credits; } else { pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_CFC; pcb->tx_cred = 0; } } NG_BTSOCKET_RFCOMM_INFO( "%s: cr=%d, dlci=%d, state=%d, flags=%#x, mtu=%d, rx_cred=%d, tx_cred=%d\n", __func__, cr, pcb->dlci, pcb->state, pcb->flags, pcb->mtu, pcb->rx_cred, pcb->tx_cred); } /* ng_btsocket_rfcomm_set_pn */ /* * Send RFCOMM SABM/DISC/UA/DM frames. 
Caller must hold s->session_mtx */ static int ng_btsocket_rfcomm_send_command(ng_btsocket_rfcomm_session_p s, u_int8_t type, u_int8_t dlci) { struct rfcomm_cmd_hdr *hdr = NULL; struct mbuf *m = NULL; int cr; mtx_assert(&s->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Sending command type %#x, session state=%d, flags=%#x, mtu=%d, dlci=%d\n", __func__, type, s->state, s->flags, s->mtu, dlci); switch (type) { case RFCOMM_FRAME_SABM: case RFCOMM_FRAME_DISC: cr = INITIATOR(s); break; case RFCOMM_FRAME_UA: case RFCOMM_FRAME_DM: cr = !INITIATOR(s); break; default: panic("%s: Invalid frame type=%#x\n", __func__, type); return (EINVAL); /* NOT REACHED */ } MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_pkthdr.len = m->m_len = sizeof(*hdr); hdr = mtod(m, struct rfcomm_cmd_hdr *); hdr->address = RFCOMM_MKADDRESS(cr, dlci); hdr->control = RFCOMM_MKCONTROL(type, 1); hdr->length = RFCOMM_MKLEN8(0); hdr->fcs = ng_btsocket_rfcomm_fcs3((u_int8_t *) hdr); NG_BT_MBUFQ_ENQUEUE(&s->outq, m); return (0); } /* ng_btsocket_rfcomm_send_command */ /* * Send RFCOMM UIH frame. Caller must hold s->session_mtx */ static int ng_btsocket_rfcomm_send_uih(ng_btsocket_rfcomm_session_p s, u_int8_t address, u_int8_t pf, u_int8_t credits, struct mbuf *data) { struct rfcomm_frame_hdr *hdr = NULL; struct mbuf *m = NULL, *mcrc = NULL; u_int16_t length; mtx_assert(&s->session_mtx, MA_OWNED); MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) { NG_FREE_M(data); return (ENOBUFS); } m->m_pkthdr.len = m->m_len = sizeof(*hdr); MGET(mcrc, M_NOWAIT, MT_DATA); if (mcrc == NULL) { NG_FREE_M(data); return (ENOBUFS); } mcrc->m_len = 1; /* Fill UIH frame header */ hdr = mtod(m, struct rfcomm_frame_hdr *); hdr->address = address; hdr->control = RFCOMM_MKCONTROL(RFCOMM_FRAME_UIH, pf); /* Calculate FCS */ mcrc->m_data[0] = ng_btsocket_rfcomm_fcs2((u_int8_t *) hdr); /* Put length back */ length = (data != NULL)? 
data->m_pkthdr.len : 0; if (length > 127) { u_int16_t l = htole16(RFCOMM_MKLEN16(length)); bcopy(&l, &hdr->length, sizeof(l)); m->m_pkthdr.len ++; m->m_len ++; } else hdr->length = RFCOMM_MKLEN8(length); if (pf) { m->m_data[m->m_len] = credits; m->m_pkthdr.len ++; m->m_len ++; } /* Add payload */ if (data != NULL) { m_cat(m, data); m->m_pkthdr.len += length; } /* Put FCS back */ m_cat(m, mcrc); m->m_pkthdr.len ++; NG_BTSOCKET_RFCOMM_INFO( "%s: Sending UIH state=%d, flags=%#x, address=%d, length=%d, pf=%d, " \ "credits=%d, len=%d\n", __func__, s->state, s->flags, address, length, pf, credits, m->m_pkthdr.len); NG_BT_MBUFQ_ENQUEUE(&s->outq, m); return (0); } /* ng_btsocket_rfcomm_send_uih */ /* * Send MSC request. Caller must hold pcb->pcb_mtx and pcb->session->session_mtx */ static int ng_btsocket_rfcomm_send_msc(ng_btsocket_rfcomm_pcb_p pcb) { struct mbuf *m = NULL; struct rfcomm_mcc_hdr *hdr = NULL; struct rfcomm_mcc_msc *msc = NULL; mtx_assert(&pcb->session->session_mtx, MA_OWNED); mtx_assert(&pcb->pcb_mtx, MA_OWNED); MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_pkthdr.len = m->m_len = sizeof(*hdr) + sizeof(*msc); hdr = mtod(m, struct rfcomm_mcc_hdr *); msc = (struct rfcomm_mcc_msc *)(hdr + 1); hdr->type = RFCOMM_MKMCC_TYPE(1, RFCOMM_MCC_MSC); hdr->length = RFCOMM_MKLEN8(sizeof(*msc)); msc->address = RFCOMM_MKADDRESS(1, pcb->dlci); msc->modem = pcb->lmodem; NG_BTSOCKET_RFCOMM_INFO( "%s: Sending MSC dlci=%d, state=%d, flags=%#x, address=%d, modem=%#x\n", __func__, pcb->dlci, pcb->state, pcb->flags, msc->address, msc->modem); return (ng_btsocket_rfcomm_send_uih(pcb->session, RFCOMM_MKADDRESS(INITIATOR(pcb->session), 0), 0, 0, m)); } /* ng_btsocket_rfcomm_send_msc */ /* * Send PN request. 
Caller must hold pcb->pcb_mtx and pcb->session->session_mtx */ static int ng_btsocket_rfcomm_send_pn(ng_btsocket_rfcomm_pcb_p pcb) { struct mbuf *m = NULL; struct rfcomm_mcc_hdr *hdr = NULL; struct rfcomm_mcc_pn *pn = NULL; mtx_assert(&pcb->session->session_mtx, MA_OWNED); mtx_assert(&pcb->pcb_mtx, MA_OWNED); MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_pkthdr.len = m->m_len = sizeof(*hdr) + sizeof(*pn); hdr = mtod(m, struct rfcomm_mcc_hdr *); pn = (struct rfcomm_mcc_pn *)(hdr + 1); hdr->type = RFCOMM_MKMCC_TYPE(1, RFCOMM_MCC_PN); hdr->length = RFCOMM_MKLEN8(sizeof(*pn)); pn->dlci = pcb->dlci; /* * Set default DLCI priority as described in GSM 07.10 * (ETSI TS 101 369) clause 5.6 page 42 */ pn->priority = (pcb->dlci < 56)? (((pcb->dlci >> 3) << 3) + 7) : 61; pn->ack_timer = 0; pn->mtu = htole16(pcb->mtu); pn->max_retrans = 0; if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) { pn->flow_control = 0xf0; pn->credits = pcb->rx_cred; } else { pn->flow_control = 0; pn->credits = 0; } NG_BTSOCKET_RFCOMM_INFO( "%s: Sending PN dlci=%d, state=%d, flags=%#x, mtu=%d, flow_control=%#x, " \ "credits=%d\n", __func__, pcb->dlci, pcb->state, pcb->flags, pcb->mtu, pn->flow_control, pn->credits); return (ng_btsocket_rfcomm_send_uih(pcb->session, RFCOMM_MKADDRESS(INITIATOR(pcb->session), 0), 0, 0, m)); } /* ng_btsocket_rfcomm_send_pn */ /* * Calculate and send credits based on available space in receive buffer */ static int ng_btsocket_rfcomm_send_credits(ng_btsocket_rfcomm_pcb_p pcb) { int error = 0; u_int8_t credits; mtx_assert(&pcb->pcb_mtx, MA_OWNED); mtx_assert(&pcb->session->session_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Sending more credits, dlci=%d, state=%d, flags=%#x, mtu=%d, " \ "space=%ld, tx_cred=%d, rx_cred=%d\n", __func__, pcb->dlci, pcb->state, pcb->flags, pcb->mtu, sbspace(&pcb->so->so_rcv), pcb->tx_cred, pcb->rx_cred); credits = sbspace(&pcb->so->so_rcv) / pcb->mtu; if (credits > 0) { if (pcb->rx_cred + credits > RFCOMM_MAX_CREDITS) credits 
= RFCOMM_MAX_CREDITS - pcb->rx_cred; error = ng_btsocket_rfcomm_send_uih( pcb->session, RFCOMM_MKADDRESS(INITIATOR(pcb->session), pcb->dlci), 1, credits, NULL); if (error == 0) { pcb->rx_cred += credits; NG_BTSOCKET_RFCOMM_INFO( "%s: Gave remote side %d more credits, dlci=%d, state=%d, flags=%#x, " \ "rx_cred=%d, tx_cred=%d\n", __func__, credits, pcb->dlci, pcb->state, pcb->flags, pcb->rx_cred, pcb->tx_cred); } else NG_BTSOCKET_RFCOMM_ERR( "%s: Could not send credits, error=%d, dlci=%d, state=%d, flags=%#x, " \ "mtu=%d, space=%ld, tx_cred=%d, rx_cred=%d\n", __func__, error, pcb->dlci, pcb->state, pcb->flags, pcb->mtu, sbspace(&pcb->so->so_rcv), pcb->tx_cred, pcb->rx_cred); } return (error); } /* ng_btsocket_rfcomm_send_credits */ /***************************************************************************** ***************************************************************************** ** RFCOMM DLCs ***************************************************************************** *****************************************************************************/ /* * Send data from socket send buffer * Caller must hold pcb->pcb_mtx and pcb->session->session_mtx */ static int ng_btsocket_rfcomm_pcb_send(ng_btsocket_rfcomm_pcb_p pcb, int limit) { struct mbuf *m = NULL; int sent, length, error; mtx_assert(&pcb->session->session_mtx, MA_OWNED); mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) limit = min(limit, pcb->tx_cred); else if (!(pcb->rmodem & RFCOMM_MODEM_FC)) limit = min(limit, RFCOMM_MAX_CREDITS); /* XXX ??? 
*/ else limit = 0; if (limit == 0) { NG_BTSOCKET_RFCOMM_INFO( "%s: Could not send - remote flow control asserted, dlci=%d, flags=%#x, " \ "rmodem=%#x, tx_cred=%d\n", __func__, pcb->dlci, pcb->flags, pcb->rmodem, pcb->tx_cred); return (0); } for (error = 0, sent = 0; sent < limit; sent ++) { length = min(pcb->mtu, sbavail(&pcb->so->so_snd)); if (length == 0) break; /* Get the chunk from the socket's send buffer */ m = ng_btsocket_rfcomm_prepare_packet(&pcb->so->so_snd, length); if (m == NULL) { error = ENOBUFS; break; } sbdrop(&pcb->so->so_snd, length); error = ng_btsocket_rfcomm_send_uih(pcb->session, RFCOMM_MKADDRESS(INITIATOR(pcb->session), pcb->dlci), 0, 0, m); if (error != 0) break; } if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) pcb->tx_cred -= sent; if (error == 0 && sent > 0) { pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_SENDING; sowwakeup(pcb->so); } return (error); } /* ng_btsocket_rfcomm_pcb_send */ /* * Unlink and disconnect DLC. If ng_btsocket_rfcomm_pcb_kill() returns * non zero value than socket has no reference and has to be detached. * Caller must hold pcb->pcb_mtx and pcb->session->session_mtx */ static void ng_btsocket_rfcomm_pcb_kill(ng_btsocket_rfcomm_pcb_p pcb, int error) { ng_btsocket_rfcomm_session_p s = pcb->session; NG_BTSOCKET_RFCOMM_INFO( "%s: Killing DLC, so=%p, dlci=%d, state=%d, flags=%#x, error=%d\n", __func__, pcb->so, pcb->dlci, pcb->state, pcb->flags, error); if (pcb->session == NULL) panic("%s: DLC without session, pcb=%p, state=%d, flags=%#x\n", __func__, pcb, pcb->state, pcb->flags); mtx_assert(&pcb->session->session_mtx, MA_OWNED); mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) ng_btsocket_rfcomm_untimeout(pcb); /* Detach DLC from the session. 
Does not matter which state DLC in */ LIST_REMOVE(pcb, session_next); pcb->session = NULL; /* Change DLC state and wakeup all sleepers */ pcb->state = NG_BTSOCKET_RFCOMM_DLC_CLOSED; pcb->so->so_error = error; soisdisconnected(pcb->so); wakeup(&pcb->state); /* Check if we have any DLCs left on the session */ if (LIST_EMPTY(&s->dlcs) && INITIATOR(s)) { NG_BTSOCKET_RFCOMM_INFO( "%s: Disconnecting session, state=%d, flags=%#x, mtu=%d\n", __func__, s->state, s->flags, s->mtu); switch (s->state) { case NG_BTSOCKET_RFCOMM_SESSION_CLOSED: case NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING: /* * Do not have to do anything here. We can get here * when L2CAP connection was terminated or we have * received DISC on multiplexor channel */ break; case NG_BTSOCKET_RFCOMM_SESSION_OPEN: /* Send DISC on multiplexor channel */ error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_DISC, 0); if (error == 0) { s->state = NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING; break; } /* FALL THROUGH */ case NG_BTSOCKET_RFCOMM_SESSION_CONNECTING: case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED: s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; break; /* case NG_BTSOCKET_RFCOMM_SESSION_LISTENING: */ default: panic("%s: Invalid session state=%d, flags=%#x\n", __func__, s->state, s->flags); break; } ng_btsocket_rfcomm_task_wakeup(); } } /* ng_btsocket_rfcomm_pcb_kill */ /* * Look for given dlci for given RFCOMM session. 
Caller must hold s->session_mtx */ static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_pcb_by_dlci(ng_btsocket_rfcomm_session_p s, int dlci) { ng_btsocket_rfcomm_pcb_p pcb = NULL; mtx_assert(&s->session_mtx, MA_OWNED); LIST_FOREACH(pcb, &s->dlcs, session_next) if (pcb->dlci == dlci) break; return (pcb); } /* ng_btsocket_rfcomm_pcb_by_dlci */ /* * Look for socket that listens on given src address and given channel */ static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_pcb_listener(bdaddr_p src, int channel) { ng_btsocket_rfcomm_pcb_p pcb = NULL, pcb1 = NULL; mtx_lock(&ng_btsocket_rfcomm_sockets_mtx); LIST_FOREACH(pcb, &ng_btsocket_rfcomm_sockets, next) { - if (pcb->channel != channel || - !(pcb->so->so_options & SO_ACCEPTCONN)) + if (pcb->channel != channel || !SOLISTENING(pcb->so)) continue; if (bcmp(&pcb->src, src, sizeof(*src)) == 0) break; if (bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) pcb1 = pcb; } mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx); return ((pcb != NULL)? pcb : pcb1); } /* ng_btsocket_rfcomm_pcb_listener */ /***************************************************************************** ***************************************************************************** ** Misc. functions ***************************************************************************** *****************************************************************************/ /* * Set timeout. Caller MUST hold pcb_mtx */ static void ng_btsocket_rfcomm_timeout(ng_btsocket_rfcomm_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (!(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO)) { pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_TIMO; pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT; callout_reset(&pcb->timo, ng_btsocket_rfcomm_timo * hz, ng_btsocket_rfcomm_process_timeout, pcb); } else panic("%s: Duplicated socket timeout?!\n", __func__); } /* ng_btsocket_rfcomm_timeout */ /* * Unset pcb timeout. 
Caller MUST hold pcb_mtx */ static void ng_btsocket_rfcomm_untimeout(ng_btsocket_rfcomm_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) { callout_stop(&pcb->timo); pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMO; pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT; } else panic("%s: No socket timeout?!\n", __func__); } /* ng_btsocket_rfcomm_timeout */ /* * Process pcb timeout */ static void ng_btsocket_rfcomm_process_timeout(void *xpcb) { ng_btsocket_rfcomm_pcb_p pcb = (ng_btsocket_rfcomm_pcb_p) xpcb; mtx_assert(&pcb->pcb_mtx, MA_OWNED); NG_BTSOCKET_RFCOMM_INFO( "%s: Timeout, so=%p, dlci=%d, state=%d, flags=%#x\n", __func__, pcb->so, pcb->dlci, pcb->state, pcb->flags); pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMO; pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT; switch (pcb->state) { case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING: case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: pcb->state = NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING; break; case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT: case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING: break; default: panic( "%s: DLC timeout in invalid state, dlci=%d, state=%d, flags=%#x\n", __func__, pcb->dlci, pcb->state, pcb->flags); break; } ng_btsocket_rfcomm_task_wakeup(); } /* ng_btsocket_rfcomm_process_timeout */ /* * Get up to length bytes from the socket buffer */ static struct mbuf * ng_btsocket_rfcomm_prepare_packet(struct sockbuf *sb, int length) { struct mbuf *top = NULL, *m = NULL, *n = NULL, *nextpkt = NULL; int mlen, noff, len; MGETHDR(top, M_NOWAIT, MT_DATA); if (top == NULL) return (NULL); top->m_pkthdr.len = length; top->m_len = 0; mlen = MHLEN; m = top; n = sb->sb_mb; nextpkt = n->m_nextpkt; noff = 0; while (length > 0 && n != NULL) { len = min(mlen - m->m_len, n->m_len - noff); if (len > length) len = length; bcopy(mtod(n, caddr_t)+noff, mtod(m, caddr_t)+m->m_len, len); m->m_len += len; noff += len; length -= len; if (length > 0 && m->m_len == mlen) { MGET(m->m_next, M_NOWAIT, MT_DATA); if (m->m_next == NULL) { 
NG_FREE_M(top); return (NULL); } m = m->m_next; m->m_len = 0; mlen = MLEN; } if (noff == n->m_len) { noff = 0; n = n->m_next; if (n == NULL) n = nextpkt; nextpkt = (n != NULL)? n->m_nextpkt : NULL; } } if (length < 0) panic("%s: length=%d\n", __func__, length); if (length > 0 && n == NULL) panic("%s: bogus length=%d, n=%p\n", __func__, length, n); return (top); } /* ng_btsocket_rfcomm_prepare_packet */ diff --git a/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c b/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c index da8f22befc79..068b1890f27f 100644 --- a/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c +++ b/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c @@ -1,1981 +1,1981 @@ /* * ng_btsocket_sco.c */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001-2002 Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: ng_btsocket_sco.c,v 1.2 2005/10/31 18:08:51 max Exp $ * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* MALLOC define */ #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_BTSOCKET_SCO, "netgraph_btsocks_sco", "Netgraph Bluetooth SCO sockets"); #else #define M_NETGRAPH_BTSOCKET_SCO M_NETGRAPH #endif /* NG_SEPARATE_MALLOC */ /* Netgraph node methods */ static ng_constructor_t ng_btsocket_sco_node_constructor; static ng_rcvmsg_t ng_btsocket_sco_node_rcvmsg; static ng_shutdown_t ng_btsocket_sco_node_shutdown; static ng_newhook_t ng_btsocket_sco_node_newhook; static ng_connect_t ng_btsocket_sco_node_connect; static ng_rcvdata_t ng_btsocket_sco_node_rcvdata; static ng_disconnect_t ng_btsocket_sco_node_disconnect; static void ng_btsocket_sco_input (void *, int); static void ng_btsocket_sco_rtclean (void *, int); /* Netgraph type descriptor */ static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_BTSOCKET_SCO_NODE_TYPE, .constructor = ng_btsocket_sco_node_constructor, .rcvmsg = ng_btsocket_sco_node_rcvmsg, .shutdown = ng_btsocket_sco_node_shutdown, .newhook = ng_btsocket_sco_node_newhook, .connect = ng_btsocket_sco_node_connect, .rcvdata = ng_btsocket_sco_node_rcvdata, .disconnect = ng_btsocket_sco_node_disconnect, }; 
/* Globals */ static u_int32_t ng_btsocket_sco_debug_level; static node_p ng_btsocket_sco_node; static struct ng_bt_itemq ng_btsocket_sco_queue; static struct mtx ng_btsocket_sco_queue_mtx; static struct task ng_btsocket_sco_queue_task; static struct mtx ng_btsocket_sco_sockets_mtx; static LIST_HEAD(, ng_btsocket_sco_pcb) ng_btsocket_sco_sockets; static LIST_HEAD(, ng_btsocket_sco_rtentry) ng_btsocket_sco_rt; static struct mtx ng_btsocket_sco_rt_mtx; static struct task ng_btsocket_sco_rt_task; static struct timeval ng_btsocket_sco_lasttime; static int ng_btsocket_sco_curpps; /* Sysctl tree */ SYSCTL_DECL(_net_bluetooth_sco_sockets); static SYSCTL_NODE(_net_bluetooth_sco_sockets, OID_AUTO, seq, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Bluetooth SEQPACKET SCO sockets family"); SYSCTL_UINT(_net_bluetooth_sco_sockets_seq, OID_AUTO, debug_level, CTLFLAG_RW, &ng_btsocket_sco_debug_level, NG_BTSOCKET_WARN_LEVEL, "Bluetooth SEQPACKET SCO sockets debug level"); SYSCTL_UINT(_net_bluetooth_sco_sockets_seq, OID_AUTO, queue_len, CTLFLAG_RD, &ng_btsocket_sco_queue.len, 0, "Bluetooth SEQPACKET SCO sockets input queue length"); SYSCTL_UINT(_net_bluetooth_sco_sockets_seq, OID_AUTO, queue_maxlen, CTLFLAG_RD, &ng_btsocket_sco_queue.maxlen, 0, "Bluetooth SEQPACKET SCO sockets input queue max. 
length"); SYSCTL_UINT(_net_bluetooth_sco_sockets_seq, OID_AUTO, queue_drops, CTLFLAG_RD, &ng_btsocket_sco_queue.drops, 0, "Bluetooth SEQPACKET SCO sockets input queue drops"); /* Debug */ #define NG_BTSOCKET_SCO_INFO \ if (ng_btsocket_sco_debug_level >= NG_BTSOCKET_INFO_LEVEL && \ ppsratecheck(&ng_btsocket_sco_lasttime, &ng_btsocket_sco_curpps, 1)) \ printf #define NG_BTSOCKET_SCO_WARN \ if (ng_btsocket_sco_debug_level >= NG_BTSOCKET_WARN_LEVEL && \ ppsratecheck(&ng_btsocket_sco_lasttime, &ng_btsocket_sco_curpps, 1)) \ printf #define NG_BTSOCKET_SCO_ERR \ if (ng_btsocket_sco_debug_level >= NG_BTSOCKET_ERR_LEVEL && \ ppsratecheck(&ng_btsocket_sco_lasttime, &ng_btsocket_sco_curpps, 1)) \ printf #define NG_BTSOCKET_SCO_ALERT \ if (ng_btsocket_sco_debug_level >= NG_BTSOCKET_ALERT_LEVEL && \ ppsratecheck(&ng_btsocket_sco_lasttime, &ng_btsocket_sco_curpps, 1)) \ printf /* * Netgraph message processing routines */ static int ng_btsocket_sco_process_lp_con_cfm (struct ng_mesg *, ng_btsocket_sco_rtentry_p); static int ng_btsocket_sco_process_lp_con_ind (struct ng_mesg *, ng_btsocket_sco_rtentry_p); static int ng_btsocket_sco_process_lp_discon_ind (struct ng_mesg *, ng_btsocket_sco_rtentry_p); /* * Send LP messages to the lower layer */ static int ng_btsocket_sco_send_lp_con_req (ng_btsocket_sco_pcb_p); static int ng_btsocket_sco_send_lp_con_rsp (ng_btsocket_sco_rtentry_p, bdaddr_p, int); static int ng_btsocket_sco_send_lp_discon_req (ng_btsocket_sco_pcb_p); static int ng_btsocket_sco_send2 (ng_btsocket_sco_pcb_p); /* * Timeout processing routines */ static void ng_btsocket_sco_timeout (ng_btsocket_sco_pcb_p); static void ng_btsocket_sco_untimeout (ng_btsocket_sco_pcb_p); static void ng_btsocket_sco_process_timeout (void *); /* * Other stuff */ static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_addr(bdaddr_p); static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_handle(bdaddr_p, int); static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_addrs(bdaddr_p, bdaddr_p); #define 
ng_btsocket_sco_wakeup_input_task() \ taskqueue_enqueue(taskqueue_swi, &ng_btsocket_sco_queue_task) #define ng_btsocket_sco_wakeup_route_task() \ taskqueue_enqueue(taskqueue_swi, &ng_btsocket_sco_rt_task) /***************************************************************************** ***************************************************************************** ** Netgraph node interface ***************************************************************************** *****************************************************************************/ /* * Netgraph node constructor. Do not allow to create node of this type. */ static int ng_btsocket_sco_node_constructor(node_p node) { return (EINVAL); } /* ng_btsocket_sco_node_constructor */ /* * Do local shutdown processing. Let old node go and create new fresh one. */ static int ng_btsocket_sco_node_shutdown(node_p node) { int error = 0; NG_NODE_UNREF(node); /* Create new node */ error = ng_make_node_common(&typestruct, &ng_btsocket_sco_node); if (error != 0) { NG_BTSOCKET_SCO_ALERT( "%s: Could not create Netgraph node, error=%d\n", __func__, error); ng_btsocket_sco_node = NULL; return (error); } error = ng_name_node(ng_btsocket_sco_node, NG_BTSOCKET_SCO_NODE_TYPE); if (error != 0) { NG_BTSOCKET_SCO_ALERT( "%s: Could not name Netgraph node, error=%d\n", __func__, error); NG_NODE_UNREF(ng_btsocket_sco_node); ng_btsocket_sco_node = NULL; return (error); } return (0); } /* ng_btsocket_sco_node_shutdown */ /* * We allow any hook to be connected to the node. */ static int ng_btsocket_sco_node_newhook(node_p node, hook_p hook, char const *name) { return (0); } /* ng_btsocket_sco_node_newhook */ /* * Just say "YEP, that's OK by me!" */ static int ng_btsocket_sco_node_connect(hook_p hook) { NG_HOOK_SET_PRIVATE(hook, NULL); NG_HOOK_REF(hook); /* Keep extra reference to the hook */ #if 0 NG_HOOK_FORCE_QUEUE(NG_HOOK_PEER(hook)); NG_HOOK_FORCE_QUEUE(hook); #endif return (0); } /* ng_btsocket_sco_node_connect */ /* * Hook disconnection. 
Schedule route cleanup task */ static int ng_btsocket_sco_node_disconnect(hook_p hook) { /* * If hook has private information than we must have this hook in * the routing table and must schedule cleaning for the routing table. * Otherwise hook was connected but we never got "hook_info" message, * so we have never added this hook to the routing table and it save * to just delete it. */ if (NG_HOOK_PRIVATE(hook) != NULL) return (ng_btsocket_sco_wakeup_route_task()); NG_HOOK_UNREF(hook); /* Remove extra reference */ return (0); } /* ng_btsocket_sco_node_disconnect */ /* * Process incoming messages */ static int ng_btsocket_sco_node_rcvmsg(node_p node, item_p item, hook_p hook) { struct ng_mesg *msg = NGI_MSG(item); /* item still has message */ int error = 0; if (msg != NULL && msg->header.typecookie == NGM_HCI_COOKIE) { mtx_lock(&ng_btsocket_sco_queue_mtx); if (NG_BT_ITEMQ_FULL(&ng_btsocket_sco_queue)) { NG_BTSOCKET_SCO_ERR( "%s: Input queue is full (msg)\n", __func__); NG_BT_ITEMQ_DROP(&ng_btsocket_sco_queue); NG_FREE_ITEM(item); error = ENOBUFS; } else { if (hook != NULL) { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); } NG_BT_ITEMQ_ENQUEUE(&ng_btsocket_sco_queue, item); error = ng_btsocket_sco_wakeup_input_task(); } mtx_unlock(&ng_btsocket_sco_queue_mtx); } else { NG_FREE_ITEM(item); error = EINVAL; } return (error); } /* ng_btsocket_sco_node_rcvmsg */ /* * Receive data on a hook */ static int ng_btsocket_sco_node_rcvdata(hook_p hook, item_p item) { int error = 0; mtx_lock(&ng_btsocket_sco_queue_mtx); if (NG_BT_ITEMQ_FULL(&ng_btsocket_sco_queue)) { NG_BTSOCKET_SCO_ERR( "%s: Input queue is full (data)\n", __func__); NG_BT_ITEMQ_DROP(&ng_btsocket_sco_queue); NG_FREE_ITEM(item); error = ENOBUFS; } else { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); NG_BT_ITEMQ_ENQUEUE(&ng_btsocket_sco_queue, item); error = ng_btsocket_sco_wakeup_input_task(); } mtx_unlock(&ng_btsocket_sco_queue_mtx); return (error); } /* ng_btsocket_sco_node_rcvdata */ /* * Process LP_ConnectCfm event 
from the lower layer protocol */

/*
 * Returns EMSGSIZE when the message argument has the wrong size, ENOENT
 * when no socket matches or the matching socket is not in the CONNECTING
 * state, and 0 once the socket was moved to OPEN (status == 0) or to
 * CLOSED with ECONNREFUSED (any other status).
 */
static int
ng_btsocket_sco_process_lp_con_cfm(struct ng_mesg *msg,
		ng_btsocket_sco_rtentry_p rt)
{
	ng_hci_lp_con_cfm_ep	*cfm;
	ng_btsocket_sco_pcb_t	*sock;
	int			 rc = 0;

	if (msg->header.arglen != sizeof(*cfm))
		return (EMSGSIZE);

	cfm = (ng_hci_lp_con_cfm_ep *)(msg->data);

	mtx_lock(&ng_btsocket_sco_sockets_mtx);

	/* Find the socket by the hook's source address and the peer address */
	sock = ng_btsocket_sco_pcb_by_addrs(&rt->src, &cfm->bdaddr);
	if (sock == NULL) {
		rc = ENOENT;
		goto out_sockets;
	}

	/* From here on the pcb mutex is held and must be released */

	NG_BTSOCKET_SCO_INFO(
"%s: Got LP_ConnectCfm response, src bdaddr=%x:%x:%x:%x:%x:%x, " \
"dst bdaddr=%x:%x:%x:%x:%x:%x, status=%d, handle=%d, state=%d\n",
		__func__,
		sock->src.b[5], sock->src.b[4], sock->src.b[3],
		sock->src.b[2], sock->src.b[1], sock->src.b[0],
		sock->dst.b[5], sock->dst.b[4], sock->dst.b[3],
		sock->dst.b[2], sock->dst.b[1], sock->dst.b[0],
		cfm->status, cfm->con_handle, sock->state);

	/* A confirmation only makes sense for a pending connect */
	if (sock->state != NG_BTSOCKET_SCO_CONNECTING) {
		rc = ENOENT;
		goto out_pcb;
	}

	ng_btsocket_sco_untimeout(sock);

	if (cfm->status != 0) {
		/* Lower layer refused: report it and close the socket */
		sock->so->so_error = ECONNREFUSED; /* XXX convert status ??? */
		sock->state = NG_BTSOCKET_SCO_CLOSED;
		soisdisconnected(sock->so);
	} else {
		/* Link is up: record the handle and open the socket */
		sock->con_handle = cfm->con_handle;
		sock->state = NG_BTSOCKET_SCO_OPEN;
		soisconnected(sock->so);
	}

out_pcb:
	mtx_unlock(&sock->pcb_mtx);
out_sockets:
	mtx_unlock(&ng_btsocket_sco_sockets_mtx);

	return (rc);
} /* ng_btsocket_sco_process_lp_con_cfm */

/*
 * Process LP_ConnectInd indicator. Find socket that listens on address.
 * Find exact or closest match.
*/ static int ng_btsocket_sco_process_lp_con_ind(struct ng_mesg *msg, ng_btsocket_sco_rtentry_p rt) { ng_hci_lp_con_ind_ep *ep = NULL; ng_btsocket_sco_pcb_t *pcb = NULL, *pcb1 = NULL; int error = 0; u_int16_t status = 0; if (msg->header.arglen != sizeof(*ep)) return (EMSGSIZE); ep = (ng_hci_lp_con_ind_ep *)(msg->data); NG_BTSOCKET_SCO_INFO( "%s: Got LP_ConnectInd indicator, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], ep->bdaddr.b[5], ep->bdaddr.b[4], ep->bdaddr.b[3], ep->bdaddr.b[2], ep->bdaddr.b[1], ep->bdaddr.b[0]); mtx_lock(&ng_btsocket_sco_sockets_mtx); pcb = ng_btsocket_sco_pcb_by_addr(&rt->src); if (pcb != NULL) { struct socket *so1; /* pcb is locked */ CURVNET_SET(pcb->so->so_vnet); so1 = sonewconn(pcb->so, 0); CURVNET_RESTORE(); if (so1 == NULL) { status = 0x0d; /* Rejected due to limited resources */ goto respond; } /* * If we got here than we have created new socket. So complete * connection. If we we listening on specific address then copy * source address from listening socket, otherwise copy source * address from hook's routing information. 
*/ pcb1 = so2sco_pcb(so1); KASSERT((pcb1 != NULL), ("%s: pcb1 == NULL\n", __func__)); mtx_lock(&pcb1->pcb_mtx); if (bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(pcb->src)) != 0) bcopy(&pcb->src, &pcb1->src, sizeof(pcb1->src)); else bcopy(&rt->src, &pcb1->src, sizeof(pcb1->src)); pcb1->flags &= ~NG_BTSOCKET_SCO_CLIENT; bcopy(&ep->bdaddr, &pcb1->dst, sizeof(pcb1->dst)); pcb1->rt = rt; } else /* Nobody listens on requested BDADDR */ status = 0x1f; /* Unspecified Error */ respond: error = ng_btsocket_sco_send_lp_con_rsp(rt, &ep->bdaddr, status); if (pcb1 != NULL) { if (error != 0) { pcb1->so->so_error = error; pcb1->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb1->so); } else { pcb1->state = NG_BTSOCKET_SCO_CONNECTING; soisconnecting(pcb1->so); ng_btsocket_sco_timeout(pcb1); } mtx_unlock(&pcb1->pcb_mtx); } if (pcb != NULL) mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (error); } /* ng_btsocket_sco_process_lp_con_ind */ /* * Process LP_DisconnectInd indicator */ static int ng_btsocket_sco_process_lp_discon_ind(struct ng_mesg *msg, ng_btsocket_sco_rtentry_p rt) { ng_hci_lp_discon_ind_ep *ep = NULL; ng_btsocket_sco_pcb_t *pcb = NULL; /* Check message */ if (msg->header.arglen != sizeof(*ep)) return (EMSGSIZE); ep = (ng_hci_lp_discon_ind_ep *)(msg->data); mtx_lock(&ng_btsocket_sco_sockets_mtx); /* Look for the socket with given channel ID */ pcb = ng_btsocket_sco_pcb_by_handle(&rt->src, ep->con_handle); if (pcb == NULL) { mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (0); } /* * Disconnect the socket. If there was any pending request we can * not do anything here anyway. 
*/ /* pcb is locked */ NG_BTSOCKET_SCO_INFO( "%s: Got LP_DisconnectInd indicator, src bdaddr=%x:%x:%x:%x:%x:%x, " \ "dst bdaddr=%x:%x:%x:%x:%x:%x, handle=%d, state=%d\n", __func__, pcb->src.b[5], pcb->src.b[4], pcb->src.b[3], pcb->src.b[2], pcb->src.b[1], pcb->src.b[0], pcb->dst.b[5], pcb->dst.b[4], pcb->dst.b[3], pcb->dst.b[2], pcb->dst.b[1], pcb->dst.b[0], pcb->con_handle, pcb->state); if (pcb->flags & NG_BTSOCKET_SCO_TIMO) ng_btsocket_sco_untimeout(pcb); pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (0); } /* ng_btsocket_sco_process_lp_discon_ind */ /* * Send LP_ConnectReq request */ static int ng_btsocket_sco_send_lp_con_req(ng_btsocket_sco_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_hci_lp_con_req_ep *ep = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_HCI_COOKIE, NGM_HCI_LP_CON_REQ, sizeof(*ep), M_NOWAIT); if (msg == NULL) return (ENOMEM); ep = (ng_hci_lp_con_req_ep *)(msg->data); ep->link_type = NG_HCI_LINK_SCO; bcopy(&pcb->dst, &ep->bdaddr, sizeof(ep->bdaddr)); NG_SEND_MSG_HOOK(error, ng_btsocket_sco_node, msg, pcb->rt->hook, 0); return (error); } /* ng_btsocket_sco_send_lp_con_req */ /* * Send LP_ConnectRsp response */ static int ng_btsocket_sco_send_lp_con_rsp(ng_btsocket_sco_rtentry_p rt, bdaddr_p dst, int status) { struct ng_mesg *msg = NULL; ng_hci_lp_con_rsp_ep *ep = NULL; int error = 0; if (rt == NULL || rt->hook == NULL || NG_HOOK_NOT_VALID(rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_HCI_COOKIE, NGM_HCI_LP_CON_RSP, sizeof(*ep), M_NOWAIT); if (msg == NULL) return (ENOMEM); ep = (ng_hci_lp_con_rsp_ep *)(msg->data); ep->status = status; ep->link_type = NG_HCI_LINK_SCO; bcopy(dst, &ep->bdaddr, sizeof(ep->bdaddr)); NG_SEND_MSG_HOOK(error, ng_btsocket_sco_node, msg, rt->hook, 0); return (error); } /* 
ng_btsocket_sco_send_lp_con_rsp */ /* * Send LP_DisconReq request */ static int ng_btsocket_sco_send_lp_discon_req(ng_btsocket_sco_pcb_p pcb) { struct ng_mesg *msg = NULL; ng_hci_lp_discon_req_ep *ep = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) return (ENETDOWN); NG_MKMESSAGE(msg, NGM_HCI_COOKIE, NGM_HCI_LP_DISCON_REQ, sizeof(*ep), M_NOWAIT); if (msg == NULL) return (ENOMEM); ep = (ng_hci_lp_discon_req_ep *)(msg->data); ep->con_handle = pcb->con_handle; ep->reason = 0x13; /* User Ended Connection */ NG_SEND_MSG_HOOK(error, ng_btsocket_sco_node, msg, pcb->rt->hook, 0); return (error); } /* ng_btsocket_sco_send_lp_discon_req */ /***************************************************************************** ***************************************************************************** ** Socket interface ***************************************************************************** *****************************************************************************/ /* * SCO sockets data input routine */ static void ng_btsocket_sco_data_input(struct mbuf *m, hook_p hook) { ng_hci_scodata_pkt_t *hdr = NULL; ng_btsocket_sco_pcb_t *pcb = NULL; ng_btsocket_sco_rtentry_t *rt = NULL; u_int16_t con_handle; if (hook == NULL) { NG_BTSOCKET_SCO_ALERT( "%s: Invalid source hook for SCO data packet\n", __func__); goto drop; } rt = (ng_btsocket_sco_rtentry_t *) NG_HOOK_PRIVATE(hook); if (rt == NULL) { NG_BTSOCKET_SCO_ALERT( "%s: Could not find out source bdaddr for SCO data packet\n", __func__); goto drop; } /* Make sure we can access header */ if (m->m_pkthdr.len < sizeof(*hdr)) { NG_BTSOCKET_SCO_ERR( "%s: SCO data packet too small, len=%d\n", __func__, m->m_pkthdr.len); goto drop; } if (m->m_len < sizeof(*hdr)) { m = m_pullup(m, sizeof(*hdr)); if (m == NULL) goto drop; } /* Strip SCO packet header and verify packet length */ hdr = mtod(m, ng_hci_scodata_pkt_t *); m_adj(m, sizeof(*hdr)); if 
(hdr->length != m->m_pkthdr.len) { NG_BTSOCKET_SCO_ERR( "%s: Bad SCO data packet length, len=%d, length=%d\n", __func__, m->m_pkthdr.len, hdr->length); goto drop; } /* * Now process packet */ con_handle = NG_HCI_CON_HANDLE(le16toh(hdr->con_handle)); NG_BTSOCKET_SCO_INFO( "%s: Received SCO data packet: src bdaddr=%x:%x:%x:%x:%x:%x, handle=%d, " \ "length=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], con_handle, hdr->length); mtx_lock(&ng_btsocket_sco_sockets_mtx); /* Find socket */ pcb = ng_btsocket_sco_pcb_by_handle(&rt->src, con_handle); if (pcb == NULL) { mtx_unlock(&ng_btsocket_sco_sockets_mtx); goto drop; } /* pcb is locked */ if (pcb->state != NG_BTSOCKET_SCO_OPEN) { NG_BTSOCKET_SCO_ERR( "%s: No connected socket found, src bdaddr=%x:%x:%x:%x:%x:%x, state=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], pcb->state); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); goto drop; } /* Check if we have enough space in socket receive queue */ if (m->m_pkthdr.len > sbspace(&pcb->so->so_rcv)) { NG_BTSOCKET_SCO_ERR( "%s: Not enough space in socket receive queue. 
Dropping SCO data packet, " \ "src bdaddr=%x:%x:%x:%x:%x:%x, len=%d, space=%ld\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], m->m_pkthdr.len, sbspace(&pcb->so->so_rcv)); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); goto drop; } /* Append packet to the socket receive queue and wakeup */ sbappendrecord(&pcb->so->so_rcv, m); m = NULL; sorwakeup(pcb->so); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); drop: NG_FREE_M(m); /* checks for m != NULL */ } /* ng_btsocket_sco_data_input */ /* * SCO sockets default message input routine */ static void ng_btsocket_sco_default_msg_input(struct ng_mesg *msg, hook_p hook) { ng_btsocket_sco_rtentry_t *rt = NULL; if (hook == NULL || NG_HOOK_NOT_VALID(hook)) return; rt = (ng_btsocket_sco_rtentry_t *) NG_HOOK_PRIVATE(hook); switch (msg->header.cmd) { case NGM_HCI_NODE_UP: { ng_hci_node_up_ep *ep = NULL; if (msg->header.arglen != sizeof(*ep)) break; ep = (ng_hci_node_up_ep *)(msg->data); if (bcmp(&ep->bdaddr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) break; if (rt == NULL) { rt = malloc(sizeof(*rt), M_NETGRAPH_BTSOCKET_SCO, M_NOWAIT|M_ZERO); if (rt == NULL) break; NG_HOOK_SET_PRIVATE(hook, rt); mtx_lock(&ng_btsocket_sco_rt_mtx); LIST_INSERT_HEAD(&ng_btsocket_sco_rt, rt, next); } else mtx_lock(&ng_btsocket_sco_rt_mtx); bcopy(&ep->bdaddr, &rt->src, sizeof(rt->src)); rt->pkt_size = (ep->pkt_size == 0)? 
60 : ep->pkt_size; rt->num_pkts = ep->num_pkts; rt->hook = hook; mtx_unlock(&ng_btsocket_sco_rt_mtx); NG_BTSOCKET_SCO_INFO( "%s: Updating hook \"%s\", src bdaddr=%x:%x:%x:%x:%x:%x, pkt_size=%d, " \ "num_pkts=%d\n", __func__, NG_HOOK_NAME(hook), rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], rt->pkt_size, rt->num_pkts); } break; case NGM_HCI_SYNC_CON_QUEUE: { ng_hci_sync_con_queue_ep *ep = NULL; ng_btsocket_sco_pcb_t *pcb = NULL; if (rt == NULL || msg->header.arglen != sizeof(*ep)) break; ep = (ng_hci_sync_con_queue_ep *)(msg->data); rt->pending -= ep->completed; if (rt->pending < 0) { NG_BTSOCKET_SCO_WARN( "%s: Pending packet counter is out of sync! bdaddr=%x:%x:%x:%x:%x:%x, " \ "handle=%d, pending=%d, completed=%d\n", __func__, rt->src.b[5], rt->src.b[4], rt->src.b[3], rt->src.b[2], rt->src.b[1], rt->src.b[0], ep->con_handle, rt->pending, ep->completed); rt->pending = 0; } mtx_lock(&ng_btsocket_sco_sockets_mtx); /* Find socket */ pcb = ng_btsocket_sco_pcb_by_handle(&rt->src, ep->con_handle); if (pcb == NULL) { mtx_unlock(&ng_btsocket_sco_sockets_mtx); break; } /* pcb is locked */ /* Check state */ if (pcb->state == NG_BTSOCKET_SCO_OPEN) { /* Remove timeout */ ng_btsocket_sco_untimeout(pcb); /* Drop completed packets from the send queue */ for (; ep->completed > 0; ep->completed --) sbdroprecord(&pcb->so->so_snd); /* Send more if we have any */ if (sbavail(&pcb->so->so_snd) > 0) if (ng_btsocket_sco_send2(pcb) == 0) ng_btsocket_sco_timeout(pcb); /* Wake up writers */ sowwakeup(pcb->so); } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); } break; default: NG_BTSOCKET_SCO_WARN( "%s: Unknown message, cmd=%d\n", __func__, msg->header.cmd); break; } NG_FREE_MSG(msg); /* Checks for msg != NULL */ } /* ng_btsocket_sco_default_msg_input */ /* * SCO sockets LP message input routine */ static void ng_btsocket_sco_lp_msg_input(struct ng_mesg *msg, hook_p hook) { ng_btsocket_sco_rtentry_p rt = NULL; if (hook == NULL) { 
NG_BTSOCKET_SCO_ALERT( "%s: Invalid source hook for LP message\n", __func__); goto drop; } rt = (ng_btsocket_sco_rtentry_p) NG_HOOK_PRIVATE(hook); if (rt == NULL) { NG_BTSOCKET_SCO_ALERT( "%s: Could not find out source bdaddr for LP message\n", __func__); goto drop; } switch (msg->header.cmd) { case NGM_HCI_LP_CON_CFM: /* Connection Confirmation Event */ ng_btsocket_sco_process_lp_con_cfm(msg, rt); break; case NGM_HCI_LP_CON_IND: /* Connection Indication Event */ ng_btsocket_sco_process_lp_con_ind(msg, rt); break; case NGM_HCI_LP_DISCON_IND: /* Disconnection Indication Event */ ng_btsocket_sco_process_lp_discon_ind(msg, rt); break; /* XXX FIXME add other LP messages */ default: NG_BTSOCKET_SCO_WARN( "%s: Unknown LP message, cmd=%d\n", __func__, msg->header.cmd); break; } drop: NG_FREE_MSG(msg); } /* ng_btsocket_sco_lp_msg_input */ /* * SCO sockets input routine */ static void ng_btsocket_sco_input(void *context, int pending) { item_p item = NULL; hook_p hook = NULL; for (;;) { mtx_lock(&ng_btsocket_sco_queue_mtx); NG_BT_ITEMQ_DEQUEUE(&ng_btsocket_sco_queue, item); mtx_unlock(&ng_btsocket_sco_queue_mtx); if (item == NULL) break; NGI_GET_HOOK(item, hook); if (hook != NULL && NG_HOOK_NOT_VALID(hook)) goto drop; switch(item->el_flags & NGQF_TYPE) { case NGQF_DATA: { struct mbuf *m = NULL; NGI_GET_M(item, m); ng_btsocket_sco_data_input(m, hook); } break; case NGQF_MESG: { struct ng_mesg *msg = NULL; NGI_GET_MSG(item, msg); switch (msg->header.cmd) { case NGM_HCI_LP_CON_CFM: case NGM_HCI_LP_CON_IND: case NGM_HCI_LP_DISCON_IND: /* XXX FIXME add other LP messages */ ng_btsocket_sco_lp_msg_input(msg, hook); break; default: ng_btsocket_sco_default_msg_input(msg, hook); break; } } break; default: KASSERT(0, ("%s: invalid item type=%ld\n", __func__, (item->el_flags & NGQF_TYPE))); break; } drop: if (hook != NULL) NG_HOOK_UNREF(hook); NG_FREE_ITEM(item); } } /* ng_btsocket_sco_input */ /* * Route cleanup task. Gets scheduled when hook is disconnected. 
Here we * will find all sockets that use "invalid" hook and disconnect them. */ static void ng_btsocket_sco_rtclean(void *context, int pending) { ng_btsocket_sco_pcb_p pcb = NULL, pcb_next = NULL; ng_btsocket_sco_rtentry_p rt = NULL; /* * First disconnect all sockets that use "invalid" hook */ mtx_lock(&ng_btsocket_sco_sockets_mtx); for(pcb = LIST_FIRST(&ng_btsocket_sco_sockets); pcb != NULL; ) { mtx_lock(&pcb->pcb_mtx); pcb_next = LIST_NEXT(pcb, next); if (pcb->rt != NULL && pcb->rt->hook != NULL && NG_HOOK_NOT_VALID(pcb->rt->hook)) { if (pcb->flags & NG_BTSOCKET_SCO_TIMO) ng_btsocket_sco_untimeout(pcb); pcb->rt = NULL; pcb->so->so_error = ENETDOWN; pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb->so); } mtx_unlock(&pcb->pcb_mtx); pcb = pcb_next; } mtx_unlock(&ng_btsocket_sco_sockets_mtx); /* * Now cleanup routing table */ mtx_lock(&ng_btsocket_sco_rt_mtx); for (rt = LIST_FIRST(&ng_btsocket_sco_rt); rt != NULL; ) { ng_btsocket_sco_rtentry_p rt_next = LIST_NEXT(rt, next); if (rt->hook != NULL && NG_HOOK_NOT_VALID(rt->hook)) { LIST_REMOVE(rt, next); NG_HOOK_SET_PRIVATE(rt->hook, NULL); NG_HOOK_UNREF(rt->hook); /* Remove extra reference */ bzero(rt, sizeof(*rt)); free(rt, M_NETGRAPH_BTSOCKET_SCO); } rt = rt_next; } mtx_unlock(&ng_btsocket_sco_rt_mtx); } /* ng_btsocket_sco_rtclean */ /* * Initialize everything */ void ng_btsocket_sco_init(void) { int error = 0; /* Skip initialization of globals for non-default instances. 
*/ if (!IS_DEFAULT_VNET(curvnet)) return; ng_btsocket_sco_node = NULL; ng_btsocket_sco_debug_level = NG_BTSOCKET_WARN_LEVEL; /* Register Netgraph node type */ error = ng_newtype(&typestruct); if (error != 0) { NG_BTSOCKET_SCO_ALERT( "%s: Could not register Netgraph node type, error=%d\n", __func__, error); return; } /* Create Netgrapg node */ error = ng_make_node_common(&typestruct, &ng_btsocket_sco_node); if (error != 0) { NG_BTSOCKET_SCO_ALERT( "%s: Could not create Netgraph node, error=%d\n", __func__, error); ng_btsocket_sco_node = NULL; return; } error = ng_name_node(ng_btsocket_sco_node, NG_BTSOCKET_SCO_NODE_TYPE); if (error != 0) { NG_BTSOCKET_SCO_ALERT( "%s: Could not name Netgraph node, error=%d\n", __func__, error); NG_NODE_UNREF(ng_btsocket_sco_node); ng_btsocket_sco_node = NULL; return; } /* Create input queue */ NG_BT_ITEMQ_INIT(&ng_btsocket_sco_queue, 300); mtx_init(&ng_btsocket_sco_queue_mtx, "btsocks_sco_queue_mtx", NULL, MTX_DEF); TASK_INIT(&ng_btsocket_sco_queue_task, 0, ng_btsocket_sco_input, NULL); /* Create list of sockets */ LIST_INIT(&ng_btsocket_sco_sockets); mtx_init(&ng_btsocket_sco_sockets_mtx, "btsocks_sco_sockets_mtx", NULL, MTX_DEF); /* Routing table */ LIST_INIT(&ng_btsocket_sco_rt); mtx_init(&ng_btsocket_sco_rt_mtx, "btsocks_sco_rt_mtx", NULL, MTX_DEF); TASK_INIT(&ng_btsocket_sco_rt_task, 0, ng_btsocket_sco_rtclean, NULL); } /* ng_btsocket_sco_init */ /* * Abort connection on socket */ void ng_btsocket_sco_abort(struct socket *so) { so->so_error = ECONNABORTED; (void) ng_btsocket_sco_disconnect(so); } /* ng_btsocket_sco_abort */ void ng_btsocket_sco_close(struct socket *so) { (void) ng_btsocket_sco_disconnect(so); } /* ng_btsocket_sco_close */ /* * Accept connection on socket. Nothing to do here, socket must be connected * and ready, so just return peer address and be done with it. 
*/ int ng_btsocket_sco_accept(struct socket *so, struct sockaddr **nam) { if (ng_btsocket_sco_node == NULL) return (EINVAL); return (ng_btsocket_sco_peeraddr(so, nam)); } /* ng_btsocket_sco_accept */ /* * Create and attach new socket */ int ng_btsocket_sco_attach(struct socket *so, int proto, struct thread *td) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); int error; /* Check socket and protocol */ if (ng_btsocket_sco_node == NULL) return (EPROTONOSUPPORT); if (so->so_type != SOCK_SEQPACKET) return (ESOCKTNOSUPPORT); #if 0 /* XXX sonewconn() calls "pru_attach" with proto == 0 */ if (proto != 0) if (proto != BLUETOOTH_PROTO_SCO) return (EPROTONOSUPPORT); #endif /* XXX */ if (pcb != NULL) return (EISCONN); /* Reserve send and receive space if it is not reserved yet */ if ((so->so_snd.sb_hiwat == 0) || (so->so_rcv.sb_hiwat == 0)) { error = soreserve(so, NG_BTSOCKET_SCO_SENDSPACE, NG_BTSOCKET_SCO_RECVSPACE); if (error != 0) return (error); } /* Allocate the PCB */ pcb = malloc(sizeof(*pcb), M_NETGRAPH_BTSOCKET_SCO, M_NOWAIT | M_ZERO); if (pcb == NULL) return (ENOMEM); /* Link the PCB and the socket */ so->so_pcb = (caddr_t) pcb; pcb->so = so; pcb->state = NG_BTSOCKET_SCO_CLOSED; callout_init(&pcb->timo, 1); /* * Mark PCB mutex as DUPOK to prevent "duplicated lock of * the same type" message. When accepting new SCO connection * ng_btsocket_sco_process_lp_con_ind() holds both PCB mutexes * for "old" (accepting) PCB and "new" (created) PCB. */ mtx_init(&pcb->pcb_mtx, "btsocks_sco_pcb_mtx", NULL, MTX_DEF|MTX_DUPOK); /* * Add the PCB to the list * * XXX FIXME VERY IMPORTANT! * * This is totally FUBAR. We could get here in two cases: * * 1) When user calls socket() * 2) When we need to accept new incoming connection and call * sonewconn() * * In the first case we must acquire ng_btsocket_sco_sockets_mtx. * In the second case we hold ng_btsocket_sco_sockets_mtx already. * So we now need to distinguish between these cases. 
From reading * /sys/kern/uipc_socket2.c we can find out that sonewconn() calls * pru_attach with proto == 0 and td == NULL. For now use this fact * to figure out if we were called from socket() or from sonewconn(). */ if (td != NULL) mtx_lock(&ng_btsocket_sco_sockets_mtx); else mtx_assert(&ng_btsocket_sco_sockets_mtx, MA_OWNED); LIST_INSERT_HEAD(&ng_btsocket_sco_sockets, pcb, next); if (td != NULL) mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (0); } /* ng_btsocket_sco_attach */ /* * Bind socket */ int ng_btsocket_sco_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_sco_pcb_t *pcb = NULL; struct sockaddr_sco *sa = (struct sockaddr_sco *) nam; if (ng_btsocket_sco_node == NULL) return (EINVAL); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->sco_family != AF_BLUETOOTH) return (EAFNOSUPPORT); if (sa->sco_len != sizeof(*sa)) return (EINVAL); mtx_lock(&ng_btsocket_sco_sockets_mtx); /* * Check if other socket has this address already (look for exact * match in bdaddr) and assign socket address if it's available. 
*/ if (bcmp(&sa->sco_bdaddr, NG_HCI_BDADDR_ANY, sizeof(sa->sco_bdaddr)) != 0) { LIST_FOREACH(pcb, &ng_btsocket_sco_sockets, next) { mtx_lock(&pcb->pcb_mtx); if (bcmp(&pcb->src, &sa->sco_bdaddr, sizeof(bdaddr_t)) == 0) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (EADDRINUSE); } mtx_unlock(&pcb->pcb_mtx); } } pcb = so2sco_pcb(so); if (pcb == NULL) { mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (EINVAL); } mtx_lock(&pcb->pcb_mtx); bcopy(&sa->sco_bdaddr, &pcb->src, sizeof(pcb->src)); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); return (0); } /* ng_btsocket_sco_bind */ /* * Connect socket */ int ng_btsocket_sco_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { ng_btsocket_sco_pcb_t *pcb = so2sco_pcb(so); struct sockaddr_sco *sa = (struct sockaddr_sco *) nam; ng_btsocket_sco_rtentry_t *rt = NULL; int have_src, error = 0; /* Check socket */ if (pcb == NULL) return (EINVAL); if (ng_btsocket_sco_node == NULL) return (EINVAL); /* Verify address */ if (sa == NULL) return (EINVAL); if (sa->sco_family != AF_BLUETOOTH) return (EAFNOSUPPORT); if (sa->sco_len != sizeof(*sa)) return (EINVAL); if (bcmp(&sa->sco_bdaddr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) return (EDESTADDRREQ); /* * Routing. Socket should be bound to some source address. The source * address can be ANY. Destination address must be set and it must not * be ANY. If source address is ANY then find first rtentry that has * src != dst. 
*/ mtx_lock(&ng_btsocket_sco_rt_mtx); mtx_lock(&pcb->pcb_mtx); if (pcb->state == NG_BTSOCKET_SCO_CONNECTING) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_rt_mtx); return (EINPROGRESS); } if (bcmp(&sa->sco_bdaddr, &pcb->src, sizeof(pcb->src)) == 0) { mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_rt_mtx); return (EINVAL); } /* Send destination address and PSM */ bcopy(&sa->sco_bdaddr, &pcb->dst, sizeof(pcb->dst)); pcb->rt = NULL; have_src = bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(pcb->src)); LIST_FOREACH(rt, &ng_btsocket_sco_rt, next) { if (rt->hook == NULL || NG_HOOK_NOT_VALID(rt->hook)) continue; /* Match src and dst */ if (have_src) { if (bcmp(&pcb->src, &rt->src, sizeof(rt->src)) == 0) break; } else { if (bcmp(&pcb->dst, &rt->src, sizeof(rt->src)) != 0) break; } } if (rt != NULL) { pcb->rt = rt; if (!have_src) bcopy(&rt->src, &pcb->src, sizeof(pcb->src)); } else error = EHOSTUNREACH; /* * Send LP_Connect request */ if (error == 0) { error = ng_btsocket_sco_send_lp_con_req(pcb); if (error == 0) { pcb->flags |= NG_BTSOCKET_SCO_CLIENT; pcb->state = NG_BTSOCKET_SCO_CONNECTING; soisconnecting(pcb->so); ng_btsocket_sco_timeout(pcb); } } mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_rt_mtx); return (error); } /* ng_btsocket_sco_connect */ /* * Process ioctl's calls on socket */ int ng_btsocket_sco_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return (EINVAL); } /* ng_btsocket_sco_control */ /* * Process getsockopt/setsockopt system calls */ int ng_btsocket_sco_ctloutput(struct socket *so, struct sockopt *sopt) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); int error, tmp; if (ng_btsocket_sco_node == NULL) return (EINVAL); if (pcb == NULL) return (EINVAL); if (sopt->sopt_level != SOL_SCO) return (0); mtx_lock(&pcb->pcb_mtx); switch (sopt->sopt_dir) { case SOPT_GET: if (pcb->state != NG_BTSOCKET_SCO_OPEN) { error = ENOTCONN; break; } switch (sopt->sopt_name) { case SO_SCO_MTU: tmp = 
pcb->rt->pkt_size; error = sooptcopyout(sopt, &tmp, sizeof(tmp)); break; case SO_SCO_CONNINFO: tmp = pcb->con_handle; error = sooptcopyout(sopt, &tmp, sizeof(tmp)); break; default: error = EINVAL; break; } break; case SOPT_SET: error = ENOPROTOOPT; break; default: error = EINVAL; break; } mtx_unlock(&pcb->pcb_mtx); return (error); } /* ng_btsocket_sco_ctloutput */ /* * Detach and destroy socket */ void ng_btsocket_sco_detach(struct socket *so) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); KASSERT(pcb != NULL, ("ng_btsocket_sco_detach: pcb == NULL")); if (ng_btsocket_sco_node == NULL) return; mtx_lock(&ng_btsocket_sco_sockets_mtx); mtx_lock(&pcb->pcb_mtx); if (pcb->flags & NG_BTSOCKET_SCO_TIMO) ng_btsocket_sco_untimeout(pcb); if (pcb->state == NG_BTSOCKET_SCO_OPEN) ng_btsocket_sco_send_lp_discon_req(pcb); pcb->state = NG_BTSOCKET_SCO_CLOSED; LIST_REMOVE(pcb, next); mtx_unlock(&pcb->pcb_mtx); mtx_unlock(&ng_btsocket_sco_sockets_mtx); mtx_destroy(&pcb->pcb_mtx); bzero(pcb, sizeof(*pcb)); free(pcb, M_NETGRAPH_BTSOCKET_SCO); soisdisconnected(so); so->so_pcb = NULL; } /* ng_btsocket_sco_detach */ /* * Disconnect socket */ int ng_btsocket_sco_disconnect(struct socket *so) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); if (pcb == NULL) return (EINVAL); if (ng_btsocket_sco_node == NULL) return (EINVAL); mtx_lock(&pcb->pcb_mtx); if (pcb->state == NG_BTSOCKET_SCO_DISCONNECTING) { mtx_unlock(&pcb->pcb_mtx); return (EINPROGRESS); } if (pcb->flags & NG_BTSOCKET_SCO_TIMO) ng_btsocket_sco_untimeout(pcb); if (pcb->state == NG_BTSOCKET_SCO_OPEN) { ng_btsocket_sco_send_lp_discon_req(pcb); pcb->state = NG_BTSOCKET_SCO_DISCONNECTING; soisdisconnecting(so); ng_btsocket_sco_timeout(pcb); } else { pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(so); } mtx_unlock(&pcb->pcb_mtx); return (0); } /* ng_btsocket_sco_disconnect */ /* * Listen on socket */ int ng_btsocket_sco_listen(struct socket *so, int backlog, struct thread *td) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); int error; if 
(pcb == NULL) return (EINVAL); if (ng_btsocket_sco_node == NULL) return (EINVAL); SOCK_LOCK(so); mtx_lock(&pcb->pcb_mtx); error = solisten_proto_check(so); if (error != 0) goto out; #if 0 if (bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0) { error = EDESTADDRREQ; goto out; } #endif solisten_proto(so, backlog); out: mtx_unlock(&pcb->pcb_mtx); SOCK_UNLOCK(so); return (error); } /* ng_btsocket_listen */ /* * Get peer address */ int ng_btsocket_sco_peeraddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); struct sockaddr_sco sa; if (pcb == NULL) return (EINVAL); if (ng_btsocket_sco_node == NULL) return (EINVAL); mtx_lock(&pcb->pcb_mtx); bcopy(&pcb->dst, &sa.sco_bdaddr, sizeof(sa.sco_bdaddr)); mtx_unlock(&pcb->pcb_mtx); sa.sco_len = sizeof(sa); sa.sco_family = AF_BLUETOOTH; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } /* ng_btsocket_sco_peeraddr */ /* * Send data to socket */ int ng_btsocket_sco_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { ng_btsocket_sco_pcb_t *pcb = so2sco_pcb(so); int error = 0; if (ng_btsocket_sco_node == NULL) { error = ENETDOWN; goto drop; } /* Check socket and input */ if (pcb == NULL || m == NULL || control != NULL) { error = EINVAL; goto drop; } mtx_lock(&pcb->pcb_mtx); /* Make sure socket is connected */ if (pcb->state != NG_BTSOCKET_SCO_OPEN) { mtx_unlock(&pcb->pcb_mtx); error = ENOTCONN; goto drop; } /* Check route */ if (pcb->rt == NULL || pcb->rt->hook == NULL || NG_HOOK_NOT_VALID(pcb->rt->hook)) { mtx_unlock(&pcb->pcb_mtx); error = ENETDOWN; goto drop; } /* Check packet size */ if (m->m_pkthdr.len > pcb->rt->pkt_size) { NG_BTSOCKET_SCO_ERR( "%s: Packet too big, len=%d, pkt_size=%d\n", __func__, m->m_pkthdr.len, pcb->rt->pkt_size); mtx_unlock(&pcb->pcb_mtx); error = EMSGSIZE; goto drop; } /* * First put packet on socket send queue. 
Then check if we have * pending timeout. If we do not have timeout then we must send * packet and schedule timeout. Otherwise do nothing and wait for * NGM_HCI_SYNC_CON_QUEUE message. */ sbappendrecord(&pcb->so->so_snd, m); m = NULL; if (!(pcb->flags & NG_BTSOCKET_SCO_TIMO)) { error = ng_btsocket_sco_send2(pcb); if (error == 0) ng_btsocket_sco_timeout(pcb); else sbdroprecord(&pcb->so->so_snd); /* XXX */ } mtx_unlock(&pcb->pcb_mtx); drop: NG_FREE_M(m); /* checks for != NULL */ NG_FREE_M(control); return (error); } /* ng_btsocket_sco_send */ /* * Send first packet in the socket queue to the SCO layer */ static int ng_btsocket_sco_send2(ng_btsocket_sco_pcb_p pcb) { struct mbuf *m = NULL; ng_hci_scodata_pkt_t *hdr = NULL; int error = 0; mtx_assert(&pcb->pcb_mtx, MA_OWNED); while (pcb->rt->pending < pcb->rt->num_pkts && sbavail(&pcb->so->so_snd) > 0) { /* Get a copy of the first packet on send queue */ m = m_dup(pcb->so->so_snd.sb_mb, M_NOWAIT); if (m == NULL) { error = ENOBUFS; break; } /* Create SCO packet header */ M_PREPEND(m, sizeof(*hdr), M_NOWAIT); if (m != NULL) if (m->m_len < sizeof(*hdr)) m = m_pullup(m, sizeof(*hdr)); if (m == NULL) { error = ENOBUFS; break; } /* Fill in the header */ hdr = mtod(m, ng_hci_scodata_pkt_t *); hdr->type = NG_HCI_SCO_DATA_PKT; hdr->con_handle = htole16(NG_HCI_MK_CON_HANDLE(pcb->con_handle, 0, 0)); hdr->length = m->m_pkthdr.len - sizeof(*hdr); /* Send packet */ NG_SEND_DATA_ONLY(error, pcb->rt->hook, m); if (error != 0) break; pcb->rt->pending ++; } return ((pcb->rt->pending > 0)? 
0 : error); } /* ng_btsocket_sco_send2 */ /* * Get socket address */ int ng_btsocket_sco_sockaddr(struct socket *so, struct sockaddr **nam) { ng_btsocket_sco_pcb_p pcb = so2sco_pcb(so); struct sockaddr_sco sa; if (pcb == NULL) return (EINVAL); if (ng_btsocket_sco_node == NULL) return (EINVAL); mtx_lock(&pcb->pcb_mtx); bcopy(&pcb->src, &sa.sco_bdaddr, sizeof(sa.sco_bdaddr)); mtx_unlock(&pcb->pcb_mtx); sa.sco_len = sizeof(sa); sa.sco_family = AF_BLUETOOTH; *nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } /* ng_btsocket_sco_sockaddr */ /***************************************************************************** ***************************************************************************** ** Misc. functions ***************************************************************************** *****************************************************************************/ /* * Look for the socket that listens on given bdaddr. * Returns exact or close match (if any). * Caller must hold ng_btsocket_sco_sockets_mtx. * Returns with locked pcb. */ static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_addr(bdaddr_p bdaddr) { ng_btsocket_sco_pcb_p p = NULL, p1 = NULL; mtx_assert(&ng_btsocket_sco_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_sco_sockets, next) { mtx_lock(&p->pcb_mtx); - if (p->so == NULL || !(p->so->so_options & SO_ACCEPTCONN)) { + if (p->so == NULL || !SOLISTENING(p->so)) { mtx_unlock(&p->pcb_mtx); continue; } if (bcmp(&p->src, bdaddr, sizeof(p->src)) == 0) return (p); /* return with locked pcb */ if (bcmp(&p->src, NG_HCI_BDADDR_ANY, sizeof(p->src)) == 0) p1 = p; mtx_unlock(&p->pcb_mtx); } if (p1 != NULL) mtx_lock(&p1->pcb_mtx); return (p1); } /* ng_btsocket_sco_pcb_by_addr */ /* * Look for the socket that assigned to given source address and handle. * Caller must hold ng_btsocket_sco_sockets_mtx. * Returns with locked pcb. 
*/ static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_handle(bdaddr_p src, int con_handle) { ng_btsocket_sco_pcb_p p = NULL; mtx_assert(&ng_btsocket_sco_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_sco_sockets, next) { mtx_lock(&p->pcb_mtx); if (p->con_handle == con_handle && bcmp(src, &p->src, sizeof(p->src)) == 0) return (p); /* return with locked pcb */ mtx_unlock(&p->pcb_mtx); } return (NULL); } /* ng_btsocket_sco_pcb_by_handle */ /* * Look for the socket in CONNECTING state with given source and destination * addresses. Caller must hold ng_btsocket_sco_sockets_mtx. * Returns with locked pcb. */ static ng_btsocket_sco_pcb_p ng_btsocket_sco_pcb_by_addrs(bdaddr_p src, bdaddr_p dst) { ng_btsocket_sco_pcb_p p = NULL; mtx_assert(&ng_btsocket_sco_sockets_mtx, MA_OWNED); LIST_FOREACH(p, &ng_btsocket_sco_sockets, next) { mtx_lock(&p->pcb_mtx); if (p->state == NG_BTSOCKET_SCO_CONNECTING && bcmp(src, &p->src, sizeof(p->src)) == 0 && bcmp(dst, &p->dst, sizeof(p->dst)) == 0) return (p); /* return with locked pcb */ mtx_unlock(&p->pcb_mtx); } return (NULL); } /* ng_btsocket_sco_pcb_by_addrs */ /* * Set timeout on socket */ static void ng_btsocket_sco_timeout(ng_btsocket_sco_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (!(pcb->flags & NG_BTSOCKET_SCO_TIMO)) { pcb->flags |= NG_BTSOCKET_SCO_TIMO; callout_reset(&pcb->timo, bluetooth_sco_rtx_timeout(), ng_btsocket_sco_process_timeout, pcb); } else KASSERT(0, ("%s: Duplicated socket timeout?!\n", __func__)); } /* ng_btsocket_sco_timeout */ /* * Unset timeout on socket */ static void ng_btsocket_sco_untimeout(ng_btsocket_sco_pcb_p pcb) { mtx_assert(&pcb->pcb_mtx, MA_OWNED); if (pcb->flags & NG_BTSOCKET_SCO_TIMO) { callout_stop(&pcb->timo); pcb->flags &= ~NG_BTSOCKET_SCO_TIMO; } else KASSERT(0, ("%s: No socket timeout?!\n", __func__)); } /* ng_btsocket_sco_untimeout */ /* * Process timeout on socket */ static void ng_btsocket_sco_process_timeout(void *xpcb) { ng_btsocket_sco_pcb_p pcb = (ng_btsocket_sco_pcb_p) 
xpcb; mtx_lock(&pcb->pcb_mtx); pcb->flags &= ~NG_BTSOCKET_SCO_TIMO; pcb->so->so_error = ETIMEDOUT; switch (pcb->state) { case NG_BTSOCKET_SCO_CONNECTING: /* Connect timeout - close the socket */ pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb->so); break; case NG_BTSOCKET_SCO_OPEN: /* Send timeout - did not get NGM_HCI_SYNC_CON_QUEUE */ sbdroprecord(&pcb->so->so_snd); sowwakeup(pcb->so); /* XXX FIXME what to do with pcb->rt->pending??? */ break; case NG_BTSOCKET_SCO_DISCONNECTING: /* Disconnect timeout - disconnect the socket anyway */ pcb->state = NG_BTSOCKET_SCO_CLOSED; soisdisconnected(pcb->so); break; default: NG_BTSOCKET_SCO_ERR( "%s: Invalid socket state=%d\n", __func__, pcb->state); break; } mtx_unlock(&pcb->pcb_mtx); } /* ng_btsocket_sco_process_timeout */ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index e71a11bdef05..1d685b43697f 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1,4084 +1,4083 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. 
Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #include #include #include #include const int tcprexmtthresh = 3; VNET_DEFINE(int, tcp_log_in_vain) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_log_in_vain), 0, "Log all incoming TCP segments to closed ports"); VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole), 0, "Do not send RST on segments to closed ports"); VNET_DEFINE(int, tcp_delack_enabled) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_delack_enabled), 0, "Delay ACK to try and piggyback it onto a data packet"); VNET_DEFINE(int, drop_synfin) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); VNET_DEFINE(int, tcp_do_prr_conservative) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr_conservative, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_prr_conservative), 0, "Do conservative Proportional Rate Reduction"); 
VNET_DEFINE(int, tcp_do_prr) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_prr), 1, "Enable Proportional Rate Reduction per RFC 6937"); VNET_DEFINE(int, tcp_do_lrd) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_lrd, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_lrd), 1, "Perform Lost Retransmission Detection"); VNET_DEFINE(int, tcp_do_newcwv) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_newcwv), 0, "Enable New Congestion Window Validation per RFC7661"); VNET_DEFINE(int, tcp_do_rfc3042) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); VNET_DEFINE(int, tcp_initcwnd_segments) = 10; SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0, "Slow-start flight size (initial congestion window) in number of segments"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP ECN"); VNET_DEFINE(int, tcp_do_ecn) = 2; SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, "TCP ECN support"); VNET_DEFINE(int, tcp_ecn_maxretries) = 1; SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, "Max 
retries before giving up on ECN"); VNET_DEFINE(int, tcp_insecure_syn) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_syn), 0, "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets"); VNET_DEFINE(int, tcp_insecure_rst) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); VNET_DEFINE(struct inpcbhead, tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); /* * TCP statistics are stored in an array of counter(9)s, which size matches * size of struct tcpstat. TCP running connection count is a regular array. 
*/ VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]); SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD | CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES, "TCP connection counts by TCP state"); static void tcp_vnet_init(const void *unused) { COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, tcp_vnet_init, NULL); #ifdef VIMAGE static void tcp_vnet_uninit(const void *unused) { COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); VNET_PCPUSTAT_FREE(tcpstat); } VNET_SYSUNINIT(tcp_vnet_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, tcp_vnet_uninit, NULL); #endif /* VIMAGE */ /* * Kernel module interface for updating tcpstat. The first argument is an index * into tcpstat treated as an array. */ void kmod_tcpstat_add(int statnum, int val) { counter_u64_add(VNET(tcpstat)[statnum], val); } #ifdef TCP_HHOOK /* * Wrapper for the TCP established input helper hook. 
*/ void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, tp->osd); } } #endif /* * CC wrapper hook functions */ void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type) { #ifdef STATS int32_t gput; #endif INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) || (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) && (tp->snd_cwnd < (tcp_compute_pipe(tp) * 2)))) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { #ifdef STATS stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, ((int32_t)tp->snd_cwnd) - tp->snd_wnd); if (!IN_RECOVERY(tp->t_flags)) stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN, tp->ccv->bytes_this_ack / (tcp_maxseg(tp) * nsegs)); if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GEQ(th->th_ack, tp->gput_ack)) { /* * Compute goodput in bits per millisecond. */ gput = (((int64_t)(th->th_ack - tp->gput_seq)) << 3) / max(1, tcp_ts_getticks() - tp->gput_ts); stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, gput); /* * XXXLAS: This is a temporary hack, and should be * chained off VOI_TCP_GPUT when stats(9) grows an API * to deal with chained VOIs. 
*/ if (tp->t_stats_gput_prev > 0) stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_GPUT_ND, ((gput - tp->t_stats_gput_prev) * 100) / tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; } #endif /* STATS */ if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += tp->ccv->bytes_this_ack; if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; } } else { tp->ccv->flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); #endif } void cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; struct inpcb *inp = tp->t_inpcb; u_int maxseg; int rtt; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_hc_get(&inp->inp_inc, &metrics); maxseg = tcp_maxseg(tp); if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; TCPSTAT_INC(tcps_usedrtt); if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; TCPSTAT_INC(tcps_usedrttvar); } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } /* * Set the initial slow-start flight size. * * If a SYN or SYN/ACK was lost and retransmitted, we have to * reduce the initial CWND to one segment as congestion is likely * requiring us to be cautious. 
*/ if (tp->snd_cwnd == 1) tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */ else tp->snd_cwnd = tcp_compute_initwnd(maxseg); if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } void inline cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); #endif switch(type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(tp->t_flags)) { tp->snd_recover = tp->snd_max; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; } break; case CC_ECN: if (!IN_CONGRECOVERY(tp->t_flags) || /* * Allow ECN reaction on ACK to CWR, if * that data segment was also CE marked. */ SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_CONGRECOVERY(tp->t_flags); TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max + 1; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; } break; case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. */ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp->t_flags); if (tp->t_flags & TF_WASCRECOVERY) ENTER_CONGRECOVERY(tp->t_flags); tp->snd_nxt = tp->snd_max; tp->t_flags &= ~TF_PREVVALID; tp->t_badrxtwin = 0; break; } if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } } void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) { INP_WLOCK_ASSERT(tp->t_inpcb); /* XXXLAS: KASSERT that we're in recovery? */ if (CC_ALGO(tp)->post_recovery != NULL) { tp->ccv->curack = th->th_ack; CC_ALGO(tp)->post_recovery(tp->ccv); } /* XXXLAS: EXIT_RECOVERY ? 
*/ tp->t_bytes_acked = 0; tp->sackhint.delivered_data = 0; tp->sackhint.prr_out = 0; } /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) void inline cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->ecnpkt_handler != NULL) { switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->ccv->flags |= CCF_IPHDR_CE; break; case IPTOS_ECN_ECT0: /* FALLTHROUGH */ case IPTOS_ECN_ECT1: /* FALLTHROUGH */ case IPTOS_ECN_NOTECT: tp->ccv->flags &= ~CCF_IPHDR_CE; break; } if (flags & TH_CWR) tp->ccv->flags |= CCF_TCPHDR_CWR; else tp->ccv->flags &= ~CCF_TCPHDR_CWR; CC_ALGO(tp)->ecnpkt_handler(tp->ccv); if (tp->ccv->flags & CCF_ACKNOW) { tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); tp->t_flags |= TF_ACKNOW; } } } void inline cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) { cc_ecnpkt_handler_flags(tp, th->th_flags, iptos); } /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input * tcp_input handles primary segment validation, inpcb lookup and * SYN processing on listen sockets * tcp_do_segment processes the ACK and text of the segment for * establishing, established and closing connections */ #ifdef INET6 int tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) { struct mbuf *m; struct in6_ifaddr *ia6; struct ip6_hdr *ip6; m = *mp; if (m->m_len < 
*offp + sizeof(struct tcphdr)) { m = m_pullup(m, *offp + sizeof(struct tcphdr)); if (m == NULL) { *mp = m; TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ip6 = mtod(m, struct ip6_hdr *); ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); *mp = NULL; return (IPPROTO_DONE); } *mp = m; return (tcp_input_with_port(mp, offp, proto, port)); } int tcp6_input(struct mbuf **mp, int *offp, int proto) { return(tcp6_input_with_port(mp, offp, proto, 0)); } #endif /* INET6 */ int tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) { struct mbuf *m = *mp; struct tcphdr *th = NULL; struct ip *ip = NULL; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int off0; int optlen = 0; #ifdef INET int len; uint8_t ipttl; #endif int tlen = 0, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ int lookupflag; uint8_t iptos; struct m_tag *fwd_tag = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif NET_EPOCH_ASSERT(); #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 
1 : 0; #endif off0 = *offp; m = *mp; *mp = NULL; to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (port) goto skip6_csum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } skip6_csum: /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } iptos = IPV6_TRAFFIC_CLASS(ip6); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ntohs(ip->ip_len) - off0; iptos = ip->ip_tos; if (port) goto skip_csum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. 
*/ len = off0 + tlen; ipttl = ip->ip_ttl; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(len); /* Reset TOS bits */ ip->ip_tos = iptos; /* Re-initialization for later version check */ ip->ip_ttl = ipttl; ip->ip_v = IPVERSION; ip->ip_hl = off0 >> 2; } skip_csum: if (th->th_sum && (port == 0)) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } } #endif /* INET */ /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { TCPSTAT_INC(tcps_rcvbadoff); goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { if (m->m_len < off0 + off) { m = m_pullup(m, off0 + off); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); } } #endif optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; /* * Convert TCP protocol specific fields to host format. */ tcp_fields_to_host(th); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
*/ if ( #ifdef INET6 (isipv6 && (m->m_flags & M_IP6_NEXTHOP)) #ifdef INET || (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) #endif #endif #if defined(INET) && !defined(INET6) (m->m_flags & M_IP_NEXTHOP) #endif ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); /* * For initial SYN packets we don't need write lock on matching * PCB, be it a listening one or a synchronized one. The packet * shall not modify its state. */ lookupflag = (thflags & (TH_ACK|TH_SYN)) == TH_SYN ? INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB; findpcb: #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : th->th_dport, INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif); } } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif, m); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET if (fwd_tag != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. 
*/ inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif); } } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif, m); #endif /* INET */ /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. * XXX MRT Send RST using which routing table? */ if (inp == NULL) { /* * Log communication attempts to ports that are not * in use. */ if ((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) || V_tcp_log_in_vain == 2) { if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK_ASSERT(inp); /* * While waiting for inp lock during the lookup, another thread * can have dropped the inpcb, in which case we need to loop back * and try to find a new inpcb to deliver to. 
*/ if (inp->inp_flags & INP_DROPPED) { INP_UNLOCK(inp); inp = NULL; goto findpcb; } if ((inp->inp_flowtype == M_HASHTYPE_NONE) && (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) && - ((inp->inp_socket == NULL) || - (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) { + ((inp->inp_socket == NULL) || !SOLISTENING(inp->inp_socket))) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } #if defined(IPSEC) || defined(IPSEC_SUPPORT) #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6) && IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) { goto dropunlock; } #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) { goto dropunlock; } #endif /* INET */ #endif /* IPSEC */ /* * Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6) { if (inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; } else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } /* * A previous connection in TIMEWAIT state is supposed to catch stray * or duplicate segments arriving late. If this segment was a * legitimate new connection attempt, the old INPCB gets removed and * we can try again to find a listening socket. */ if (inp->inp_flags & INP_TIMEWAIT) { tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); /* * NB: tcp_twcheck unlocks the INP and frees the mbuf. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; return (IPPROTO_DONE); } /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. 
*/ tp = intotcpcb(inp); if (tp == NULL || tp->t_state == TCPS_CLOSED) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_input(tp, m); m = NULL; /* consumed by the TOE driver */ goto dropunlock; } #endif #ifdef MAC if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; #ifdef INET6 if (isipv6) { bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); } else #endif bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* TCPDEBUG */ /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ - KASSERT(tp->t_state == TCPS_LISTEN || !(so->so_options & SO_ACCEPTCONN), + KASSERT(tp->t_state == TCPS_LISTEN || !SOLISTENING(so), ("%s: so accepting but tp %p not listening", __func__, tp)); - if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN)) { + if (tp->t_state == TCPS_LISTEN && SOLISTENING(so)) { struct in_conninfo inc; bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { inc.inc_flags |= INC_ISIPV6; if (inp->inp_inc.inc_flags & INC_IPV6MINMTU) inc.inc_flags |= INC_IPV6MINMTU; inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; inc.inc_fibnum = so->so_fibnum; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. 
*/ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. */ rstreason = syncache_expand(&inc, &to, th, &so, m, port); if (rstreason < 0) { /* * A failing TCP MD5 signature comparison * must result in the segment being dropped * and must not produce any response back * to the sender. */ goto dropunlock; } else if (rstreason == 0) { /* * No syncache entry or ACK was not * for our SYN/ACK. Send a RST. * NB: syncache did its own logging * of the failure cause. */ rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } tfo_socket_result: if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. * If we came here via jump to tfo_socket_result, * then listening socket is read-locked. */ INP_UNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); /* * New connection inpcb is already locked by * syncache_expand(). */ INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. 
*/ TCP_PROBE5(receive, NULL, tp, m, tp, th); tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); return (IPPROTO_DONE); } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th, m, port); goto dropunlock; } /* * We can't do anything without SYN. */ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc, port); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. */ if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). * * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. 
*/ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); INP_RLOCK_ASSERT(inp); #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* INET6 */ /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. * If it is from this socket it must be forged. 
* Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } #ifdef INET6 if (isipv6) { if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } #endif /* * SYN appears to be valid. Create compressed TCP state * for syncache. 
*/ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); if ((so = syncache_add(&inc, &to, th, inp, so, m, NULL, NULL, iptos, port)) != NULL) goto tfo_socket_result; /* * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). */ return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* * When a listen socket is torn down the SO_ACCEPTCONN * flag is removed first while connections are drained * from the accept queue in a unlock/lock cycle of the * ACCEPT_LOCK, opening a race condition allowing a SYN * attempt go through unhandled. */ goto dropunlock; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) { tcp_dooptions(&to, optp, optlen, thflags); if ((to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); goto dropunlock; } if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to.to_signature) != 0) goto dropunlock; } #endif TCP_PROBE5(receive, NULL, tp, m, tp, th); /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. * * XXXGL: in case of a pure SYN arriving on existing connection * TCP stacks won't need to modify the PCB, they would either drop * the segment silently, or send a challenge ACK. However, we try * to upgrade the lock, because calling convention for stacks is * write-lock on PCB. If upgrade fails, drop the SYN. 
*/ if (lookupflag == INPLOOKUP_RLOCKPCB && INP_TRY_UPGRADE(inp) == 0) goto dropunlock; tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); return (IPPROTO_DONE); dropwithreset: TCP_PROBE5(receive, NULL, tp, m, tp, th); if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_UNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ goto drop; dropunlock: if (m != NULL) TCP_PROBE5(receive, NULL, tp, m, tp, th); if (inp != NULL) INP_UNLOCK(inp); drop: if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); return (IPPROTO_DONE); } /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (eg. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit it * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. Application has not set receive buffer size with * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. * 2. the number of bytes received during 1/2 of an sRTT * is at least 3/8 of the current socket buffer size. * 3. receive buffer size has not hit maximal automatic size; * * If all of the criteria are met we increaset the socket buffer * by a 1/2 (bounded by the max). 
This allows us to keep ahead * of slow-start but also makes it so our peer never gets limited * by our rwnd which we then open up causing a burst. * * This algorithm does two steps per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ int tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int tlen) { int newsize = 0; if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) && tp->t_srtt != 0 && tp->rfbuf_ts != 0 && TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) > ((tp->t_srtt >> TCP_RTT_SHIFT)/2)) { if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 2)/ 4 * 3) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min((so->so_rcv.sb_hiwat + (so->so_rcv.sb_hiwat/2)), V_tcp_autorcvbuf_max); } TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize); /* Start over with next RTT. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else { tp->rfbuf_cnt += tlen; /* add up */ } return (newsize); } int tcp_input(struct mbuf **mp, int *offp, int proto) { return(tcp_input_with_port(mp, offp, proto, 0)); } void tcp_handle_wakeup(struct tcpcb *tp, struct socket *so) { /* * Since tp might be gone if the session entered * the TIME_WAIT state before coming here, we need * to check if the socket is still connected. 
*/ if (tp == NULL) { return; } if (so == NULL) { return; } INP_LOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_WAKESOR) { tp->t_flags &= ~TF_WAKESOR; SOCKBUF_LOCK_ASSERT(&so->so_rcv); sorwakeup_locked(so); } } void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { int thflags, acked, ourfinisacked, needoutput = 0, sack_changed; int rstreason, todrop, win, incforsyn = 0; uint32_t tiwin; uint16_t nsegs; char *s; struct in_conninfo *inc; struct mbuf *mfree; struct tcpopt to; int tfo_syn; u_int maxseg; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; tp->sackhint.last_sack_ack = 0; sack_changed = 0; nsegs = max(1, m->m_pkthdr.lro_nsegs); NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); #ifdef TCPPCAP /* Save segment, if requested. */ tcp_pcap_add(th, m, &(tp->t_inpkts)); #endif TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, NULL, true); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); free(s, M_TCPLOG); } goto drop; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. 
*/ tp->t_rcvtime = ticks; /* * Scale up the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif /* * TCP ECN processing. */ if (tp->t_flags2 & TF2_ECN_PERMIT) { if (thflags & TH_CWR) { tp->t_flags2 &= ~TF2_ECN_SND_ECE; tp->t_flags |= TF_ACKNOW; } switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags2 |= TF2_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Process a packet differently from RFC3168. */ cc_ecnpkt_handler(tp, th, iptos); /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if ((tp->t_flags & TF_SIGNATURE) != 0 && (to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_sigopt); /* XXX: should drop? */ } #endif /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; else if (tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && SEQ_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. 
*/ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { /* Handle parallel SYN for ECN */ if (!(thflags & TH_ACK) && ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { tp->t_flags2 |= TF2_ECN_PERMIT; tp->t_flags2 |= TF2_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_shs); } if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE) && !(tp->t_flags & TF_NOOPT)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } else tp->t_flags &= ~TF_REQ_SCALE; /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if ((to.to_flags & TOF_TS) && (tp->t_flags & TF_REQ_TSTMP) && !(tp->t_flags & TF_NOOPT)) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } else tp->t_flags &= ~TF_REQ_TSTMP; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (!(to.to_flags & TOF_SACKPERM) || (tp->t_flags & TF_NOOPT))) tp->t_flags &= ~TF_SACK_PERMIT; if (IS_FASTOPEN(tp->t_flags)) { if ((to.to_flags & TOF_FASTOPEN) && !(tp->t_flags & TF_NOOPT)) { uint16_t mss; if (to.to_flags & TOF_MSS) mss = to.to_mss; else if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) mss = TCP6_MSS; else mss = TCP_MSS; tcp_fastopen_update_cache(tp, mss, to.to_tfo_len, to.to_tfo_cookie); } else tcp_fastopen_disable_path(tp); } } /* * If timestamps were negotiated during SYN/ACK and a * segment without a timestamp is received, silently drop * the segment, unless it is a RST segment or missing timestamps are * tolerated. * See section 3.2 of RFC 7323. 
*/ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if (((thflags & TH_RST) != 0) || V_tcp_tolerate_missing_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment processed normally\n", s, __func__); free(s, M_TCPLOG); } } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment silently dropped\n", s, __func__); free(s, M_TCPLOG); } goto drop; } } /* * If timestamps were not negotiated during SYN/ACK and a * segment with a timestamp is received, ignore the * timestamp and process the packet normally. * See section 3.2 of RFC 7323. */ if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment processed normally\n", s, __func__); free(s, M_TCPLOG); } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. 
*/ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && SEGQ_EMPTY(tp) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. */ TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery without timestamps. */ if ((to.to_flags & TOF_TS) == 0 && tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. 
*/ hhook_run_tcp_est_in(tp, th, &to); #endif TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, nsegs, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); sowwakeup(so); if (sbavail(&so->so_snd)) (void) tp->t_fb->tfb_tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ /* * This is a pure, in-sequence data packet with * nothing on the reassembly queue and we have enough * buffer space to take it. 
*/ /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tp->t_fb->tfb_tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. 
*/ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (IS_FASTOPEN(tp->t_flags)) { /* * When a TFO connection is in SYN_RECEIVED, the * only valid packets are the initial SYN, a * retransmit/copy of the initial SYN (possibly with * a subset of the original data), a valid ACK, a * FIN, or a RST. */ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ if ((tcp_timer_active(tp, TT_DELACK) || tcp_timer_active(tp, TT_REXMT))) goto drop; } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { goto drop; } } break; /* * If the state is SYN_SENT: * if seg contains a RST with valid ACK (SEQ.ACK has already * been verified), then drop the connection. * if seg contains a RST without an ACK, drop the seg. * if seg does not contain SYN, then drop the seg. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, m, tp, th); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { int tfo_partial_ack = 0; TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? 
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If not all the data that was sent in the TFO SYN * has been acked, resend the remainder right away. */ if (IS_FASTOPEN(tp->t_flags) && (tp->snd_una != tp->snd_max)) { tp->snd_nxt = th->th_ack; tfo_partial_ack = 1; } /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && (V_tcp_do_ecn == 1)) { tp->t_flags2 |= TF2_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, m, tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. 
*/ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. 
*/ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: /* FALLTHROUGH */ default: tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCPS_SYN_RECEIVED) { TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. 
If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. 
*/ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } /* * DSACK - add SACK block for dropped range */ if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) { tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop); /* * ACK now, as the next in-sequence segment * will clear the DSACK block again */ tp->t_flags |= TF_ACKNOW; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. 
*/ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) { if (tp->t_state == TCPS_SYN_RECEIVED && IS_FASTOPEN(tp->t_flags)) { tp->snd_wnd = tiwin; cc_conn_init(tp); } goto step6; } else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? 
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->snd_wnd = tiwin; /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, m, tp, th); /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such * connections is not harmless as it would undo the * snd_cwnd reduction that occurs when a TFO SYN|ACK * is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * Account for the ACK of our SYN prior to * regular ACK processing below, except for * simultaneous SYN, which is handled later. */ if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) incforsyn = 1; /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) { (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tcp_handle_wakeup(tp, so); } tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. 
*/ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) { if (((sack_changed = tcp_sack_doack(tp, &to, th->th_ack)) != 0) && (tp->t_flags & TF_LRD)) { tcp_sack_lost_retransmission(tp, th); } } else /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. */ tp->sackhint.sacked_bytes = 0; #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); #endif if (SEQ_LEQ(th->th_ack, tp->snd_una)) { maxseg = tcp_maxseg(tp); if (tlen == 0 && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close. */ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. 
*/ /* * Following 2 kinds of acks should not affect * dupack counting: * 1) Old acks * 2) Acks with SACK but without any new SACK * information in them. These could result from * any anomaly in the network like a switch * duplicating packets or a possible DoS attack. */ if (th->th_ack != tp->snd_una || ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACK) && !sack_changed)) break; else if (!tcp_timer_active(tp, TT_REXMT)) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, nsegs, CC_DUPACK); if (V_tcp_do_prr && IN_FASTRECOVERY(tp->t_flags) && (tp->t_flags & TF_SACK_PERMIT)) { tcp_do_prr_ack(tp, th, &to); } else if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACK) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ if (V_tcp_do_newsack) awnd = tcp_compute_pipe(tp); else awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh || (tp->t_flags & TF_SACK_PERMIT && V_tcp_do_newsack && tp->sackhint.sacked_bytes > (tcprexmtthresh - 1) * maxseg)) { enter_recovery: /* * Above is the RFC6675 trigger condition of * more than (dupthresh-1)*maxseg sacked data. * If the count of holes in the * scoreboard is >= dupthresh, we could * also enter loss recovery, but don't * have that value readily available. */ tp->t_dupacks = tcprexmtthresh; tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, or prr, check * to see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. 
*/ if (V_tcp_do_prr || (tp->t_flags & TF_SACK_PERMIT)) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, nsegs, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (V_tcp_do_prr) { /* * snd_ssthresh is already updated by * cc_cong_signal. */ tp->sackhint.prr_delivered = tp->sackhint.sacked_bytes; tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACK)) { TCPSTAT_INC( tcps_sack_recovery_episode); tp->snd_recover = tp->snd_nxt; tp->snd_cwnd = maxseg; (void) tp->t_fb->tfb_tcp_output(tp); if (SEQ_GT(th->th_ack, tp->snd_una)) goto resume_partialack; goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = maxseg; (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { /* * Process first and second duplicate * ACKs. Each indicates a segment * leaving the network, creating room * for more. Make sure we can send a * packet on reception of each duplicate * ACK by increasing snd_cwnd by one * segment. Restore the original * snd_cwnd after packet transmission. */ cc_ack_received(tp, th, nsegs, CC_DUPACK); uint32_t oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs. 
*/ SOCKBUF_LOCK(&so->so_snd); avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } break; } else { /* * This ack is advancing the left edge, reset the * counter. */ tp->t_dupacks = 0; /* * If this ack also has new SACK info, increment the * counter as per rfc6675. The variable * sack_changed tracks all changes to the SACK * scoreboard, including when partial ACKs without * SACK options are received, and clear the scoreboard * from the left side. Such partial ACKs should not be * counted as dupacks here. */ if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACK) && sack_changed) { tp->t_dupacks++; /* limit overhead by setting maxseg last */ if (!IN_FASTRECOVERY(tp->t_flags) && (tp->sackhint.sacked_bytes > ((tcprexmtthresh - 1) * (maxseg = tcp_maxseg(tp))))) { goto enter_recovery; } } } resume_partialack: KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. 
*/ if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) if (V_tcp_do_prr && to.to_flags & TOF_SACK) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tcp_do_prr_ack(tp, th, &to); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } else tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else cc_post_recovery(tp, th); } else if (IN_CONGRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (V_tcp_do_prr) { tp->sackhint.delivered_data = BYTES_THIS_ACK(tp, th); tp->snd_fack = th->th_ack; tcp_do_prr_ack(tp, th, &to); (void) tcp_output(tp); } } else cc_post_recovery(tp, th); } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Adjust for the SYN bit in sequence space, * but don't account for it in cwnd calculations. * This is for the SYN_RECEIVED, non-simultaneous * SYN case. SYN_SENT and simultaneous SYN are * treated elsewhere. */ if (incforsyn) tp->snd_una++; acked = BYTES_THIS_ACK(tp, th); KASSERT(acked >= 0, ("%s: acked unexepectedly negative " "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__, tp->snd_una, th->th_ack, tp, m)); TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. 
Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin && SEQ_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * Let the congestion control algorithm update congestion * control related information. This typically means increasing * the congestion window. 
*/ cc_ack_received(tp, th, nsegs, CC_ACK); SOCKBUF_LOCK(&so->so_snd); if (acked > sbavail(&so->so_snd)) { if (tp->snd_wnd >= sbavail(&so->so_snd)) tp->snd_wnd -= sbavail(&so->so_snd); else tp->snd_wnd = 0; mfree = sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); if (tp->snd_wnd >= (uint32_t) acked) tp->snd_wnd -= acked; else tp->snd_wnd = 0; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* XXXLAS: Can this be moved up into cc_post_recovery? */ if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. 
*/ case TCPS_CLOSING: if (ourfinisacked) { tcp_twstart(tp); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. 
We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && IS_FASTOPEN(tp->t_flags)); if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_rnxt = tp->rcv_nxt; int save_tlen = tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. 
This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { if (DELAY_ACK(tp, tlen) || tfo_syn) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); tp->t_flags |= TF_WAKESOR; } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ tcp_seq temp = save_start; thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { /* * DSACK actually handled in the fastpath * above. */ tcp_update_sack_list(tp, save_start, save_start + save_tlen); } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { if ((tp->rcv_numsacks >= 1) && (tp->sackblks[0].end == save_start)) { /* * Partial overlap, recorded at todrop * above. */ tcp_update_sack_list(tp, tp->sackblks[0].start, tp->sackblks[0].end); } else { tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } } else if (tlen >= save_tlen) { /* Update of sackblks. 
*/ tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } else if (tlen > 0) { tcp_update_dsack_list(tp, save_start, save_start + tlen); } } tcp_handle_wakeup(tp, so); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* The socket upcall is handled by socantrcvmore. */ socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: tcp_twstart(tp); return; } } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); /* * Return any desired output. 
*/ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tp->t_fb->tfb_tcp_output(tp); check_delack: INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); if (tp != NULL) { INP_WUNLOCK(tp->t_inpcb); } m_freem(m); } /* * Issue RST and make ACK acceptable to originator of segment. * The mbuf must still include the original packet header. * tp may be NULL. 
*/ void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif if (tp != NULL) { INP_LOCK_ASSERT(tp->t_inpcb); } /* Don't bother if destination was broadcast/multicast. */ if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; #ifdef INET6 if (mtod(m, struct ip *)->ip_v == 6) { ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; /* IPv6 anycast check is done at tcp6_input() */ } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } #endif /* Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; /* tcp_respond consumes the mbuf chain. */ if (th->th_flags & TH_ACK) { tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); } else { if (th->th_flags & TH_SYN) tlen++; if (th->th_flags & TH_FIN) tlen++; tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } return; drop: m_freem(m); } /* * Parse TCP options and place in tcpopt. 
*/
/*
 * "to" receives the parsed result (to_flags is cleared first); "cp"/"cnt"
 * give the raw option bytes and their length; "flags" is tested for TO_SYN.
 * MSS, window-scale, SACK-permitted and fast-open options are recorded only
 * when TO_SYN is set; SACK blocks are recorded only when it is clear.
 * to_signature, to_sacks and to_tfo_cookie are set to point into the cp
 * buffer itself; nothing is copied for those options.
 * Parsing stops at TCPOPT_EOL or at the first option whose length byte is
 * missing, below 2, or larger than the number of bytes remaining.
 */
void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
	int opt, optlen;

	to->to_flags = 0;
	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		/*
		 * In the cases below "continue" still runs the for-increment,
		 * so a rejected option is stepped over and parsing goes on.
		 */
		switch (opt) {
		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(flags & TO_SYN))
				continue;
			to->to_flags |= TOF_MSS;
			bcopy((char *)cp + 2,
			    (char *)&to->to_mss, sizeof(to->to_mss));
			to->to_mss = ntohs(to->to_mss);
			break;
		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(flags & TO_SYN))
				continue;
			to->to_flags |= TOF_SCALE;
			/* Clamp the advertised shift to TCP_MAX_WINSHIFT. */
			to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
			break;
		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			to->to_flags |= TOF_TS;
			bcopy((char *)cp + 2,
			    (char *)&to->to_tsval, sizeof(to->to_tsval));
			to->to_tsval = ntohl(to->to_tsval);
			bcopy((char *)cp + 6,
			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
			to->to_tsecr = ntohl(to->to_tsecr);
			break;
		case TCPOPT_SIGNATURE:
			/*
			 * In order to reply to a host which has set the
			 * TCP_SIGNATURE option in its initial SYN, we have
			 * to record the fact that the option was observed
			 * here for the syncache code to perform the correct
			 * response.
			 */
			if (optlen != TCPOLEN_SIGNATURE)
				continue;
			to->to_flags |= TOF_SIGNATURE;
			to->to_signature = cp + 2;
			break;
		case TCPOPT_SACK_PERMITTED:
			if (optlen != TCPOLEN_SACK_PERMITTED)
				continue;
			if (!(flags & TO_SYN))
				continue;
			if (!V_tcp_do_sack)
				continue;
			to->to_flags |= TOF_SACKPERM;
			break;
		case TCPOPT_SACK:
			/* Payload must be a non-empty whole number of blocks. */
			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
				continue;
			if (flags & TO_SYN)
				continue;
			to->to_flags |= TOF_SACK;
			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
			to->to_sacks = cp + 2;
			TCPSTAT_INC(tcps_sack_rcv_blocks);
			break;
		case TCPOPT_FAST_OPEN:
			/*
			 * Cookie length validation is performed by the
			 * server side cookie checking code or the client
			 * side cookie cache update code.
			 */
			if (!(flags & TO_SYN))
				continue;
			if (!V_tcp_fastopen_client_enable &&
			    !V_tcp_fastopen_server_enable)
				continue;
			to->to_flags |= TOF_FASTOPEN;
			to->to_tfo_len = optlen - 2;
			/* A zero-length cookie is reported as a NULL pointer. */
			to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
			break;
		default:
			continue;
		}
	}
}

/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 *
 * The byte is located at offset (off + th_urp - 1) from the start of the
 * mbuf chain m, saved in the tcpcb's t_iobc, and the rest of that mbuf is
 * shifted down over it.  Panics if the chain ends before that offset is
 * reached, or if the computed offset is negative.
 */
void
tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
    int off)
{
	int cnt = off + th->th_urp - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			/* The urgent byte lives in this mbuf. */
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			INP_WLOCK_ASSERT(tp->t_inpcb);

			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			/* Close the one-byte gap left by the removed byte. */
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			if (m->m_flags & M_PKTHDR)
				m->m_pkthdr.len--;
			return;
		}
		/* Not in this mbuf: consume its length and move on. */
		cnt -= m->m_len;
		m = m->m_next;
		if (m == NULL)
			break;
	}
	panic("tcp_pulloutofband");
}

/*
 * Collect new round-trip time estimate
 * and update averages and current timeout.
 *
 * NOTE(review): rtt is presumably in ticks (the STATS path converts it
 * with "rtt * 1000 / hz") and offset by one ("Adjust rtt to origin 0"
 * below) -- confirm against the callers.
 */
void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
	int delta;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	TCPSTAT_INC(tcps_rttupdated);
	tp->t_rttupdated++;
#ifdef STATS
	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT,
	    imax(0, rtt * 1000 / hz));
#endif
	if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
		/*
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 2^5 = 32).  The following
		 * magic is equivalent to the smoothing algorithm in rfc793
		 * with an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 */
		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));

		/* Never let the smoothed rtt collapse to zero or below. */
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1;

		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1;
		/* Track the smallest srtt + rttvar seen so far. */
		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << TCP_RTT_SHIFT;
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	}
	/* The sample has been consumed; clear timing and backoff state. */
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing interface
 * without forcing IP to fragment.
If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * NOTE that resulting t_maxseg doesn't include space for TCP options or
 * IP options, e.g. IPSEC data, since length of this data may vary, and
 * thus it is calculated for every segment separately in tcp_output().
 *
 * NOTE that this routine is only called when we process an incoming
 * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
 * settings are handled in tcp_mssopt().
 *
 * Arguments as used below: "offer" is the peer's MSS (0 = none on the SYN,
 * -1 = no SYN seen yet); "mtuoffer", when not -1, is an MTU from which the
 * offer is derived and requires offer == -1; "metricptr", if non-NULL,
 * receives a copy of the hostcache metrics (zeroed on the early return);
 * "cap" is passed through to tcp_maxmtu()/tcp_maxmtu6().
 */
void
tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
    struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
{
	int mss = 0;
	uint32_t maxmtu = 0;
	struct inpcb *inp = tp->t_inpcb;
	struct hc_metrics_lite metrics;
#ifdef INET6
	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	size_t min_protoh = isipv6 ?
			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
			    sizeof (struct tcpiphdr);
#else
	size_t min_protoh = sizeof(struct tcpiphdr);
#endif

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* A non-zero t_port adds the UDP tunneling overhead to the headers. */
	if (tp->t_port)
		min_protoh += V_tcp_udp_tunneling_overhead;
	if (mtuoffer != -1) {
		KASSERT(offer == -1, ("%s: conflict", __func__));
		offer = mtuoffer - min_protoh;
	}

	/* Initialize. */
#ifdef INET6
	if (isipv6) {
		maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
		tp->t_maxseg = V_tcp_v6mssdflt;
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
		tp->t_maxseg = V_tcp_mssdflt;
	}
#endif

	/*
	 * No route to sender, stay with default mss and return.
	 */
	if (maxmtu == 0) {
		/*
		 * In case we return early we need to initialize metrics
		 * to a defined state as tcp_hc_get() would do for us
		 * if there was no cache hit.
		 */
		if (metricptr != NULL)
			bzero(metricptr, sizeof(struct hc_metrics_lite));
		return;
	}

	/* What have we got? */
	switch (offer) {
		case 0:
			/*
			 * Offer == 0 means that there was no MSS on the SYN
			 * segment, in this case we use tcp_mssdflt as
			 * already assigned to t_maxseg above.
			 */
			offer = tp->t_maxseg;
			break;

		case -1:
			/*
			 * Offer == -1 means that we didn't receive SYN yet.
			 */
			/* FALLTHROUGH */

		default:
			/*
			 * Prevent DoS attack with too small MSS. Round up
			 * to at least minmss.
			 */
			offer = max(offer, V_tcp_minmss);
	}

	/*
	 * rmx information is now retrieved from tcp_hostcache.
	 */
	tcp_hc_get(&inp->inp_inc, &metrics);
	if (metricptr != NULL)
		bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));

	/*
	 * If there's a discovered mtu in tcp hostcache, use it.
	 * Else, use the link mtu.
	 */
	if (metrics.rmx_mtu)
		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
	else {
#ifdef INET6
		if (isipv6) {
			mss = maxmtu - min_protoh;
			if (!V_path_mtu_discovery &&
			    !in6_localaddr(&inp->in6p_faddr))
				mss = min(mss, V_tcp_v6mssdflt);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			mss = maxmtu - min_protoh;
			if (!V_path_mtu_discovery &&
			    !in_localaddr(inp->inp_faddr))
				mss = min(mss, V_tcp_mssdflt);
		}
#endif
		/*
		 * XXX - The above conditional (mss = maxmtu - min_protoh)
		 * probably violates the TCP spec.
		 * The problem is that, since we don't know the
		 * other end's MSS, we are supposed to use a conservative
		 * default.  But, if we do that, then MTU discovery will
		 * never actually take place, because the conservative
		 * default is much less than the MTUs typically seen
		 * on the Internet today.  For the moment, we'll sweep
		 * this under the carpet.
		 *
		 * The conservative default might not actually be a problem
		 * if the only case this occurs is when sending an initial
		 * SYN with options and data to a host we've never talked
		 * to before.  Then, they will reply with an MSS value which
		 * will get recorded and the new parameters should get
		 * recomputed.  For Further Study.
		 */
	}
	/* Never exceed what the peer offered. */
	mss = min(mss, offer);

	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if
	 * all the option space is used (40bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
	mss = max(mss, 64);

	tp->t_maxseg = mss;
}

/*
 * Establish the connection's MSS from the peer's offer via
 * tcp_mss_update(), then size the socket buffers around it: each buffer
 * is rounded up to a multiple of the MSS, capped at sb_max, and only ever
 * grown.  If the send buffer is smaller than the MSS, the MSS is reduced
 * to the buffer size instead.  Finally records the TSO limits from "cap"
 * when CSUM_TSO is set in its capability flags.
 */
void
tcp_mss(struct tcpcb *tp, int offer)
{
	int mss;
	uint32_t bufsize;
	struct inpcb *inp;
	struct socket *so;
	struct hc_metrics_lite metrics;
	struct tcp_ifcap cap;

	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));

	bzero(&cap, sizeof(cap));
	/* Fills in both metrics and cap; metrics is zeroed if no route. */
	tcp_mss_update(tp, offer, -1, &metrics, &cap);

	mss = tp->t_maxseg;
	inp = tp->t_inpcb;

	/*
	 * If there's a pipesize, change the socket buffer to that size,
	 * don't change if sb_hiwat is different than default (then it
	 * has been changed on purpose with setsockopt).
	 * Make the socket buffers an integral number of mss units;
	 * if the mss is larger than the socket buffer, decrease the mss.
	 */
	so = inp->inp_socket;
	SOCKBUF_LOCK(&so->so_snd);
	if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe)
		bufsize = metrics.rmx_sendpipe;
	else
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;
	else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_snd.sb_hiwat)
			(void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
	}
	SOCKBUF_UNLOCK(&so->so_snd);
	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if
	 * all the option space is used (40bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
	tp->t_maxseg = max(mss, 64);

	/* Receive side: same rounding, using the local (unclamped) mss. */
	SOCKBUF_LOCK(&so->so_rcv);
	if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe)
		bufsize = metrics.rmx_recvpipe;
	else
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_rcv.sb_hiwat)
			(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
	}
	SOCKBUF_UNLOCK(&so->so_rcv);

	/* Check the interface for TSO capabilities. */
	if (cap.ifcap & CSUM_TSO) {
		tp->t_flags |= TF_TSO;
		tp->t_tsomax = cap.tsomax;
		tp->t_tsomaxsegcount = cap.tsomaxsegcount;
		tp->t_tsomaxsegsize = cap.tsomaxsegsize;
	}
}

/*
 * Determine the MSS option to send on an outgoing SYN.
 *
 * Starts from the per-family default MSS.  If both an interface MTU and a
 * hostcache MTU are known, the smaller one (minus protocol headers) is
 * used; if only one is known, that one is used; if neither, the default
 * is returned unchanged.
 */
int
tcp_mssopt(struct in_conninfo *inc)
{
	int mss = 0;
	uint32_t thcmtu = 0;
	uint32_t maxmtu = 0;
	size_t min_protoh;

	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		mss = V_tcp_v6mssdflt;
		maxmtu = tcp_maxmtu6(inc, NULL);
		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		mss = V_tcp_mssdflt;
		maxmtu = tcp_maxmtu(inc, NULL);
		min_protoh = sizeof(struct tcpiphdr);
	}
#endif
#if defined(INET6) || defined(INET)
	thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
#endif

	if (maxmtu && thcmtu)
		mss = min(maxmtu, thcmtu) - min_protoh;
	else if (maxmtu || thcmtu)
		/* Exactly one is non-zero here, so max() picks it. */
		mss = max(maxmtu, thcmtu) - min_protoh;

	return (mss);
}

/*
 * Proportional Rate Reduction: on an ACK received during recovery, work
 * out how many whole segments may be sent in response and set snd_cwnd
 * accordingly.  The "th" and "to" arguments are not referenced in this
 * body.
 */
void
tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
	int snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
	int maxseg = tcp_maxseg(tp);

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/*
	 * Compute the amount of data that this ACK is indicating
	 * (del_data) and an estimate of how many bytes are in the
	 * network.
	 */
	del_data = tp->sackhint.delivered_data;
	if (V_tcp_do_newsack)
		pipe = tcp_compute_pipe(tp);
	else
		pipe = (tp->snd_nxt - tp->snd_fack) +
			tp->sackhint.sack_bytes_rexmit;
	tp->sackhint.prr_delivered += del_data;
	/*
	 * Proportional Rate Reduction
	 */
	if (pipe >= tp->snd_ssthresh) {
		/* First ACK of this recovery: latch the flight size (>= 1). */
		if (tp->sackhint.recover_fs == 0)
			tp->sackhint.recover_fs =
			    imax(1, tp->snd_nxt - tp->snd_una);
		snd_cnt = howmany((long)tp->sackhint.prr_delivered *
			    tp->snd_ssthresh, tp->sackhint.recover_fs) -
			    tp->sackhint.prr_out;
	} else {
		if (V_tcp_do_prr_conservative)
			limit = tp->sackhint.prr_delivered -
			    tp->sackhint.prr_out;
		else
			limit = imax(tp->sackhint.prr_delivered -
				    tp->sackhint.prr_out, del_data) +
				    maxseg;
		snd_cnt = imin((tp->snd_ssthresh - pipe), limit);
	}
	/* Clamp at zero and convert bytes to whole segments (truncating). */
	snd_cnt = imax(snd_cnt, 0) / maxseg;
	/*
	 * Send snd_cnt new data into the network in response to this ack.
	 * If there is going to be a SACK retransmission, adjust snd_cwnd
	 * accordingly.
	 */
	if (IN_FASTRECOVERY(tp->t_flags)) {
		tp->snd_cwnd = imax(maxseg, tp->snd_nxt - tp->snd_recover +
			tp->sackhint.sack_bytes_rexmit +
			(snd_cnt * maxseg));
	} else if (IN_CONGRECOVERY(tp->t_flags))
		tp->snd_cwnd = imax(maxseg, pipe - del_data +
				    (snd_cnt * maxseg));
}

/*
 * When a partial ack arrives, force the retransmission of the
 * next unacknowledged segment.  Do not clear tp->t_dupacks.
 * By setting snd_nxt to th_ack, this forces retransmission timer to
 * be started again.
 */
void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
	/* Saved so they can be restored after the forced output below. */
	tcp_seq onxt = tp->snd_nxt;
	uint32_t ocwnd = tp->snd_cwnd;
	u_int maxseg = tcp_maxseg(tp);

	INP_WLOCK_ASSERT(tp->t_inpcb);

	tcp_timer_activate(tp, TT_REXMT, 0);
	tp->t_rtttime = 0;
	tp->snd_nxt = th->th_ack;
	/*
	 * Set snd_cwnd to one segment beyond acknowledged offset.
	 * (tp->snd_una has not yet been updated when this function is called.)
	 */
	tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
	tp->t_flags |= TF_ACKNOW;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->snd_cwnd = ocwnd;
	/* Do not move snd_nxt backwards past where it was on entry. */
	if (SEQ_GT(onxt, tp->snd_nxt))
		tp->snd_nxt = onxt;
	/*
	 * Partial window deflation.  Relies on fact that tp->snd_una
	 * not updated yet.
	 */
	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
	else
		tp->snd_cwnd = 0;
	tp->snd_cwnd += maxseg;
}

/*
 * Estimate of bytes currently in the network: everything sent but not yet
 * cumulatively acknowledged (snd_max - snd_una), plus bytes retransmitted
 * via SACK, minus bytes the peer has SACKed.
 */
int
tcp_compute_pipe(struct tcpcb *tp)
{
	return (tp->snd_max - tp->snd_una +
		tp->sackhint.sack_bytes_rexmit -
		tp->sackhint.sacked_bytes);
}

/*
 * Return the initial congestion window, in bytes, for a connection whose
 * segment size is maxseg.  Which rule applies is chosen by the
 * V_tcp_initcwnd_segments and V_tcp_do_rfc3390 settings, in that order.
 */
uint32_t
tcp_compute_initwnd(uint32_t maxseg)
{
	/*
	 * Calculate the Initial Window, also used as Restart Window
	 *
	 * RFC5681 Section 3.1 specifies the default conservative values.
	 * RFC3390 specifies slightly more aggressive values.
	 * RFC6928 increases it to ten segments.
	 * Support for user specified value for initial flight size.
	 */
	if (V_tcp_initcwnd_segments)
		return min(V_tcp_initcwnd_segments * maxseg,
		    max(2 * maxseg, V_tcp_initcwnd_segments * 1460));
	else if (V_tcp_do_rfc3390)
		return min(4 * maxseg, max(2 * maxseg, 4380));
	else {
		/* Per RFC5681 Section 3.1 */
		if (maxseg > 2190)
			return (2 * maxseg);
		else if (maxseg > 1095)
			return (3 * maxseg);
		else
			return (4 * maxseg);
	}
}
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index de22310d241a..a1531ea8d2f3 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1,4096 +1,4096 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_tcpdebug.h" #include #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef TCP_HHOOK #include #endif #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include #include VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif #ifdef NETFLIX_EXP_DETECTION /* Sack attack detection thresholds and such */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack_attack, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Sack Attack detection thresholds"); int32_t tcp_force_detection = 0; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection, CTLFLAG_RW, &tcp_force_detection, 0, "Do we force detection even if the INP has it off?"); int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh, CTLFLAG_RW, &tcp_sack_to_ack_thresh, 700, "Percentage of sacks to acks we must see above (10.1 percent is 101)?"); int32_t tcp_sack_to_move_thresh = 600; /* 60 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, move_thresh, CTLFLAG_RW, &tcp_sack_to_move_thresh, 600, "Percentage of sack moves we must see above (10.1 percent is 101)"); int32_t 
tcp_restoral_thresh = 650; /* 65 % (sack:2:ack -5%) */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, restore_thresh, CTLFLAG_RW, &tcp_restoral_thresh, 550, "Percentage of sack to ack percentage we must see below to restore(10.1 percent is 101)"); int32_t tcp_sad_decay_val = 800; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, decay_per, CTLFLAG_RW, &tcp_sad_decay_val, 800, "The decay percentage (10.1 percent equals 101 )"); int32_t tcp_map_minimum = 500; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, nummaps, CTLFLAG_RW, &tcp_map_minimum, 500, "Number of Map enteries before we start detection"); int32_t tcp_attack_on_turns_on_logging = 0; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, attacks_logged, CTLFLAG_RW, &tcp_attack_on_turns_on_logging, 0, "When we have a positive hit on attack, do we turn on logging?"); int32_t tcp_sad_pacing_interval = 2000; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_pacing_int, CTLFLAG_RW, &tcp_sad_pacing_interval, 2000, "What is the minimum pacing interval for a classified attacker?"); int32_t tcp_sad_low_pps = 100; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps, CTLFLAG_RW, &tcp_sad_low_pps, 100, "What is the input pps that below which we do not decay?"); #endif uint32_t tcp_ack_war_time_window = 1000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow, CTLFLAG_RW, &tcp_ack_war_time_window, 1000, "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?"); uint32_t tcp_ack_war_cnt = 5; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, CTLFLAG_RW, &tcp_ack_war_cnt, 5, "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?"); struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, 
TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. 
*/ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); VNET_DEFINE(int, tcp_tolerate_missing_ts) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_tolerate_missing_ts), 0, "Tolerate missing TCP timestamps"); VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ts_offset_per_conn), 0, "Initialize TCP timestamps per connection instead of per host pair"); /* How many connections are pacing */ static volatile uint32_t number_of_tcp_connections_pacing = 0; static uint32_t shadow_num_connections = 0; static int tcp_pacing_limit = 10000; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW, &tcp_pacing_limit, 1000, "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD, &shadow_num_connections, 0, "Number of TCP connections being paced"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); VNET_DEFINE_STATIC(int, icmp_may_rst) 
= 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0; /* unlimited */ static int sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_tcp_map_entries_limit; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { /* only allow "0" and value > minimum */ if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT) error = EINVAL; else V_tcp_map_entries_limit = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_map_entries_limit), 0, &sysctl_net_inet_tcp_map_limit_check, "IU", "Total sendmap entries limit"); VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0; /* unlimited */ SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_map_split_limit), 0, "Total sendmap split entries limit"); #ifdef TCP_HHOOK VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif #define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]); #define V_ts_offset_secret VNET(ts_offset_secret) static int tcp_default_fb_init(struct tcpcb *tp); static void tcp_default_fb_fini(struct tcpcb *tp, 
int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static void tcp_mtudisc(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); static struct tcp_function_block tcp_def_funcblk = { .tfb_tcp_block_name = "freebsd", .tfb_tcp_output = tcp_output, .tfb_tcp_do_segment = tcp_do_segment, .tfb_tcp_ctloutput = tcp_default_ctloutput, .tfb_tcp_handoff_ok = tcp_default_handoff_ok, .tfb_tcp_fb_init = tcp_default_fb_init, .tfb_tcp_fb_fini = tcp_default_fb_fini, }; static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk=NULL; TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return(blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk=NULL; struct tcp_function *f; TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); } static struct tcp_function_block * 
find_and_ref_tcp_default_fb(void)
{
	struct tcp_function_block *rblk;

	/*
	 * Take a reference on whatever block is the current default.  The
	 * read lock keeps tcp_func_set_ptr stable while the reference is
	 * acquired.
	 */
	rw_rlock(&tcp_function_lock);
	rblk = tcp_func_set_ptr;
	refcount_acquire(&rblk->tfb_refcnt);
	rw_runlock(&tcp_function_lock);
	return (rblk);
}

/*
 * Move a connection off its current stack.
 *
 * The current stack's fini hook is run and its reference dropped.  The
 * configured default stack is tried first (unless it is the stack being
 * left, refuses the handoff, or fails to initialize); failing that the
 * connection is placed on the built-in tcp_def_funcblk.  The function
 * panics if the built-in stack cannot be referenced, refuses, or fails.
 * Must not be called for a connection already on the built-in stack.
 */
void
tcp_switch_back_to_default(struct tcpcb *tp)
{
	struct tcp_function_block *tfb;

	KASSERT(tp->t_fb != &tcp_def_funcblk,
	    ("%s: called by the built-in default stack", __func__));

	/*
	 * Release the old stack. This function will either find a new one
	 * or panic.
	 */
	if (tp->t_fb->tfb_tcp_fb_fini != NULL)
		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
	refcount_release(&tp->t_fb->tfb_refcnt);

	/*
	 * Now, we'll find a new function block to use.
	 * Start by trying the current user-selected
	 * default, unless this stack is the user-selected
	 * default.
	 */
	tfb = find_and_ref_tcp_default_fb();
	/* tp->t_fb still names the old stack here; only compared, not used. */
	if (tfb == tp->t_fb) {
		refcount_release(&tfb->tfb_refcnt);
		tfb = NULL;
	}
	/* Does the stack accept this connection? (non-zero means refusal) */
	if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL &&
	    (*tfb->tfb_tcp_handoff_ok)(tp)) {
		refcount_release(&tfb->tfb_refcnt);
		tfb = NULL;
	}
	/* Try to use that stack. */
	if (tfb != NULL) {
		/* Initialize the new stack. If it succeeds, we are done. */
		tp->t_fb = tfb;
		if (tp->t_fb->tfb_tcp_fb_init == NULL ||
		    (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
			return;

		/*
		 * Initialization failed. Release the reference count on
		 * the stack.
		 */
		refcount_release(&tfb->tfb_refcnt);
	}

	/*
	 * If that wasn't feasible, use the built-in default
	 * stack which is not allowed to reject anyone.
	 */
	tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
	if (tfb == NULL) {
		/* there always should be a default */
		panic("Can't refer to tcp_def_funcblk");
	}
	if (tfb->tfb_tcp_handoff_ok != NULL) {
		if ((*tfb->tfb_tcp_handoff_ok) (tp)) {
			/* The default stack cannot say no */
			panic("Default stack rejects a new session?");
		}
	}
	tp->t_fb = tfb;
	if (tp->t_fb->tfb_tcp_fb_init != NULL &&
	    (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
		/* The default stack cannot fail */
		panic("Default stack initialization failed");
	}
}

/*
 * Receive hook passed to udp_set_kernel_tunneling() for the TCP-over-UDP
 * sockets.
 *
 * 'off' is the byte offset of the UDP header from the start of the mbuf
 * data.  The UDP source port is saved in the packet header, the UDP header
 * is squeezed out so the TCP header follows the IP header directly, the IP
 * length field is reduced to match, and the packet is passed to the TCP
 * input routine for its IP version.  Packets that cannot be processed are
 * freed.
 */
static void
tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
    const struct sockaddr *sa, void *ctx)
{
	struct ip *iph;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct udphdr *uh;
	struct tcphdr *th;
	int thlen;
	uint16_t port;

	TCPSTAT_INC(tcps_tunneled_pkts);
	if ((m->m_flags & M_PKTHDR) == 0) {
		/* Can't handle one that is not a pkt hdr */
		TCPSTAT_INC(tcps_tunneled_errs);
		goto out;
	}
	/* First make sure a minimal TCP header is contiguous. */
	thlen = sizeof(struct tcphdr);
	if (m->m_len < off + sizeof(struct udphdr) + thlen &&
	    (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
		TCPSTAT_INC(tcps_tunneled_errs);
		goto out;
	}
	iph = mtod(m, struct ip *);
	uh = (struct udphdr *)((caddr_t)iph + off);
	th = (struct tcphdr *)(uh + 1);
	/* Then pull up the full header length taken from th_off. */
	thlen = th->th_off << 2;
	if (m->m_len < off + sizeof(struct udphdr) + thlen) {
		m = m_pullup(m, off + sizeof(struct udphdr) + thlen);
		if (m == NULL) {
			TCPSTAT_INC(tcps_tunneled_errs);
			goto out;
		} else {
			/* The data may have moved; recompute the pointers. */
			iph = mtod(m, struct ip *);
			uh = (struct udphdr *)((caddr_t)iph + off);
			th = (struct tcphdr *)(uh + 1);
		}
	}
	m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
	/* Slide the TCP header (and what follows) over the UDP header. */
	bcopy(th, uh, m->m_len - off);
	m->m_len -= sizeof(struct udphdr);
	m->m_pkthdr.len -= sizeof(struct udphdr);
	/*
	 * We use the same algorithm for
	 * both UDP and TCP for c-sum. So
	 * the code in tcp_input will skip
	 * the checksum. So we do nothing
	 * with the flag (m->m_pkthdr.csum_flags).
*/ switch (iph->ip_v) { #ifdef INET case IPVERSION: iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr)); tcp_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif #ifdef INET6 case IPV6_VERSION >> 4: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr)); tcp6_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif default: goto out; break; } return; out: m_freem(m); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error=ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return(error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; bool alias; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D', "Alias", "PCB count"); cp += linesz; bufsz -= linesz; 
outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name); linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', alias ? f->tf_name : "-", f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT; #ifdef INET VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL; #define V_udp4_tun_socket VNET(udp4_tun_socket) #endif #ifdef INET6 VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL; #define V_udp6_tun_socket VNET(udp6_tun_socket) #endif static void tcp_over_udp_stop(void) { /* * This function assumes sysctl caller holds inp_rinfo_lock() * for writting! */ #ifdef INET if (V_udp4_tun_socket != NULL) { soclose(V_udp4_tun_socket); V_udp4_tun_socket = NULL; } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { soclose(V_udp6_tun_socket); V_udp6_tun_socket = NULL; } #endif } static int tcp_over_udp_start(void) { uint16_t port; int ret; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif /* * This function assumes sysctl caller holds inp_info_rlock() * for writting! 
*/ port = V_tcp_udp_tunneling_port; if (ntohs(port) == 0) { /* Must have a port set */ return (EINVAL); } #ifdef INET if (V_udp4_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET if ((ret = socreate(PF_INET, &V_udp4_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket, tcp_recv_udp_tunneled_packet, tcp_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. */ memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_port = htons(port); if ((ret = sobind(V_udp4_tun_socket, (struct sockaddr *)&sin, curthread))) { tcp_over_udp_stop(); return (ret); } #endif #ifdef INET6 if ((ret = socreate(PF_INET6, &V_udp6_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket, tcp_recv_udp_tunneled_packet, tcp6_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. 
*/ memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_port = htons(port); if ((ret = sobind(V_udp6_tun_socket, (struct sockaddr *)&sin6, curthread))) { tcp_over_udp_stop(); return (ret); } #endif return (0); } static int sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t old, new; old = V_tcp_udp_tunneling_port; new = old; error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { if ((new < TCP_TUNNELING_PORT_MIN) || (new > TCP_TUNNELING_PORT_MAX)) { error = EINVAL; } else { V_tcp_udp_tunneling_port = new; if (old != 0) { tcp_over_udp_stop(); } if (new != 0) { error = tcp_over_udp_start(); } } } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_port), 0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU", "Tunneling port for tcp over udp"); VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT; static int sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_udp_tunneling_overhead; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if ((new < TCP_TUNNELING_OVERHEAD_MIN) || (new > TCP_TUNNELING_OVERHEAD_MAX)) error = EINVAL; else V_tcp_udp_tunneling_overhead = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_overhead), 0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU", "MSS reduction when using tcp over udp"); /* * Exports one (struct tcp_function_info) for each alias/name. */ static int sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) { int cnt, error; struct tcp_function *f; struct tcp_function_info tfi; /* * We don't allow writes. 
*/
	if (req->newptr != NULL)
		return (EINVAL);

	/*
	 * Wire the old buffer so we can directly copy the functions to
	 * user space without dropping the lock.
	 */
	if (req->oldptr != NULL) {
		error = sysctl_wire_old_buffer(req, 0);
		if (error)
			return (error);
	}

	/*
	 * Walk the list and copy out matching entries. If INVARIANTS
	 * is compiled in, also walk the list to verify the length of
	 * the list matches what we have recorded.
	 */
	rw_rlock(&tcp_function_lock);

	cnt = 0;
#ifndef INVARIANTS
	/* Size query only: the recorded count is enough, skip the walk. */
	if (req->oldptr == NULL) {
		cnt = tcp_fb_cnt;
		goto skip_loop;
	}
#endif
	TAILQ_FOREACH(f, &t_functions, tf_next) {
#ifdef INVARIANTS
		cnt++;
#endif
		if (req->oldptr != NULL) {
			/* One record per registered name. */
			bzero(&tfi, sizeof(tfi));
			tfi.tfi_refcnt = f->tf_fb->tfb_refcnt;
			tfi.tfi_id = f->tf_fb->tfb_id;
			(void)strlcpy(tfi.tfi_alias, f->tf_name,
			    sizeof(tfi.tfi_alias));
			(void)strlcpy(tfi.tfi_name,
			    f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name));
			error = SYSCTL_OUT(req, &tfi, sizeof(tfi));
			/*
			 * Don't stop on error, as that is the
			 * mechanism we use to accumulate length
			 * information if the buffer was too short.
			 */
		}
	}
	KASSERT(cnt == tcp_fb_cnt,
	    ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt));
#ifndef INVARIANTS
skip_loop:
#endif
	rw_runlock(&tcp_function_lock);
	/* Size query: report room for one record more than currently listed. */
	if (req->oldptr == NULL)
		error = SYSCTL_OUT(req, NULL,
		    (cnt + 1) * sizeof(struct tcp_function_info));

	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
    CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info",
    "List TCP function block name-to-ID mappings");

/*
 * tfb_tcp_handoff_ok() function for the default stack.
 * Note that we'll basically try to take all comers.
 *
 * Always returns 0, which callers in this file treat as "accepted".
 */
static int
tcp_default_handoff_ok(struct tcpcb *tp)
{

	return (0);
}

/*
 * tfb_tcp_fb_init() function for the default stack.
 *
 * This handles making sure we have appropriate timers set if you are
 * transitioning a socket that has some amount of setup done.
*
 * The init() function from the default can *never* return non-zero i.e.
 * it is required to always succeed since it is the stack of last resort!
 *
 * The inpcb write lock must be held (asserted below).
 */
static int
tcp_default_fb_init(struct tcpcb *tp)
{

	struct socket *so;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
	    ("%s: connection %p in unexpected state %d", __func__, tp,
	    tp->t_state));

	/*
	 * Nothing to do for states up to and including LISTEN. And, we
	 * don't know what to do for unexpected states (which includes
	 * TIME_WAIT).
	 */
	if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
		return (0);

	/*
	 * Make sure some kind of transmission timer is set if there is
	 * outstanding data.  "Outstanding" here means the connection is
	 * not yet established, the send buffer is non-empty, or sent data
	 * is unacknowledged -- and neither the retransmit nor the persist
	 * timer is already running.
	 */
	so = tp->t_inpcb->inp_socket;
	if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
	    tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
	    tcp_timer_active(tp, TT_PERSIST))) {
		/*
		 * If the session has established and it looks like it should
		 * be in the persist state, set the persist timer. Otherwise,
		 * set the retransmit timer.
		 */
		if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
		    (int32_t)(tp->snd_nxt - tp->snd_una) <
		    (int32_t)sbavail(&so->so_snd))
			tcp_setpersist(tp);
		else
			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
	}

	/*
	 * All non-embryonic sessions get a keepalive timer: the idle
	 * interval once established, the init interval before that.
	 */
	if (!tcp_timer_active(tp, TT_KEEP))
		tcp_timer_activate(tp, TT_KEEP,
		    TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
		    TP_KEEPINIT(tp));

	/*
	 * Make sure critical variables are initialized
	 * if transitioning while in Recovery.
	 */
	if IN_FASTRECOVERY(tp->t_flags) {
		if (tp->sackhint.recover_fs == 0)
			tp->sackhint.recover_fs = max(1,
			    tp->snd_nxt - tp->snd_una);
	}

	return (0);
}

/*
 * tfb_tcp_fb_fini() function for the default stack.
 *
 * This changes state as necessary (or prudent) to prepare for another stack
 * to assume responsibility for the connection.
*/ static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged) { INP_WLOCK_ASSERT(tp->t_inpcb); return; } /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; #ifdef TCP_HHOOK struct osd osd; #endif }; VNET_DEFINE_STATIC(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. */ static void tcp_zone_change(void *tag) { uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_tcpcb_zone, maxsockets); tcp_tw_zone_change(); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } static volatile int next_tcp_stack_id = 1; /* * Register a TCP function block with the name provided in the names * array. 
(Note that this function does NOT automatically register * blk->tfb_tcp_block_name as a stack name. Therefore, you should * explicitly include blk->tfb_tcp_block_name in the list of names if * you wish to register the stack with that name.) * * Either all name registrations will succeed or all will fail. If * a name registration fails, the function will update the num_names * argument to point to the array index of the name that encountered * the failure. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names) { struct tcp_function *n; struct tcp_function_set fs; int error, i; KASSERT(names != NULL && *num_names > 0, ("%s: Called with 0-length name list", __func__)); KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); KASSERT(rw_initialized(&tcp_function_lock), ("%s: called too early", __func__)); if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* * These functions are required and you * need a name. */ *num_names = 0; return (EINVAL); } if (blk->tfb_tcp_timer_stop_all || blk->tfb_tcp_timer_activate || blk->tfb_tcp_timer_active || blk->tfb_tcp_timer_stop) { /* * If you define one timer function you * must have them all. 
*/ if ((blk->tfb_tcp_timer_stop_all == NULL) || (blk->tfb_tcp_timer_activate == NULL) || (blk->tfb_tcp_timer_active == NULL) || (blk->tfb_tcp_timer_stop == NULL)) { *num_names = 0; return (EINVAL); } } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { *num_names = 0; return (EINVAL); } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (n == NULL) { error = ENOMEM; goto cleanup; } n->tf_fb = blk; (void)strlcpy(fs.function_set_name, names[i], sizeof(fs.function_set_name)); rw_wlock(&tcp_function_lock); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ rw_wunlock(&tcp_function_lock); free(n, M_TCPFUNCTIONS); error = EALREADY; goto cleanup; } (void)strlcpy(n->tf_name, names[i], sizeof(n->tf_name)); TAILQ_INSERT_TAIL(&t_functions, n, tf_next); tcp_fb_cnt++; rw_wunlock(&tcp_function_lock); } return(0); cleanup: /* * Deregister the names we just added. Because registration failed * for names[i], we don't need to deregister that name. */ *num_names = i; rw_wlock(&tcp_function_lock); while (--i >= 0) { TAILQ_FOREACH(n, &t_functions, tf_next) { if (!strncmp(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX)) { TAILQ_REMOVE(&t_functions, n, tf_next); tcp_fb_cnt--; n->tf_fb = NULL; free(n, M_TCPFUNCTIONS); break; } } } rw_wunlock(&tcp_function_lock); return (error); } /* * Register a TCP function block using the name provided in the name * argument. * * Returns 0 on success, or an error code on failure. 
*/
int
register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name,
    int wait)
{
	const char *name_list[1];
	int num_names, rv;

	/* A NULL name means "use the block's own name". */
	num_names = 1;
	if (name != NULL)
		name_list[0] = name;
	else
		name_list[0] = blk->tfb_tcp_block_name;
	rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names);
	return (rv);
}

/*
 * Register a TCP function block using the name defined in
 * blk->tfb_tcp_block_name.
 *
 * Returns 0 on success, or an error code on failure.
 */
int
register_tcp_functions(struct tcp_function_block *blk, int wait)
{

	return (register_tcp_functions_as_name(blk, NULL, wait));
}

/*
 * Deregister all names associated with a function block. This
 * functionally removes the function block from use within the system.
 *
 * When called with a true quiesce argument, mark the function block
 * as being removed so no more stacks will use it and determine
 * whether the removal would succeed.
 *
 * When called with a false quiesce argument, actually attempt the
 * removal.
 *
 * When called with a force argument, attempt to switch all TCBs to
 * use the default stack instead of returning EBUSY.
 *
 * Returns 0 on success (or if the removal would succeed), or an error
 * code on failure: EPERM for the built-in stack, EBUSY if the block is
 * the current default or still has references.
 */
int
deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
    bool force)
{
	struct tcp_function *f;

	if (blk == &tcp_def_funcblk) {
		/* You can't un-register the default */
		return (EPERM);
	}
	rw_wlock(&tcp_function_lock);
	if (blk == tcp_func_set_ptr) {
		/* You can't free the current default */
		rw_wunlock(&tcp_function_lock);
		return (EBUSY);
	}
	/* Mark the block so no more stacks can use it. */
	blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
	/*
	 * If TCBs are still attached to the stack, attempt to switch them
	 * to the default stack.
	 */
	if (force && blk->tfb_refcnt) {
		struct inpcb *inp;
		struct tcpcb *tp;
		VNET_ITERATOR_DECL(vnet_iter);

		/* The function lock is dropped for the walk and re-taken after. */
		rw_wunlock(&tcp_function_lock);

		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
			INP_INFO_WLOCK(&V_tcbinfo);
			CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
				INP_WLOCK(inp);
				if (inp->inp_flags & INP_TIMEWAIT) {
					INP_WUNLOCK(inp);
					continue;
				}
				tp = intotcpcb(inp);
				/* Only connections on this stack are moved. */
				if (tp == NULL || tp->t_fb != blk) {
					INP_WUNLOCK(inp);
					continue;
				}
				tcp_switch_back_to_default(tp);
				INP_WUNLOCK(inp);
			}
			INP_INFO_WUNLOCK(&V_tcbinfo);
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		rw_wlock(&tcp_function_lock);
	}
	if (blk->tfb_refcnt) {
		/* TCBs still attached. */
		rw_wunlock(&tcp_function_lock);
		return (EBUSY);
	}
	if (quiesce) {
		/* Skip removal. */
		rw_wunlock(&tcp_function_lock);
		return (0);
	}
	/* Remove any function names that map to this function block. */
	while (find_tcp_fb_locked(blk, &f) != NULL) {
		TAILQ_REMOVE(&t_functions, f, tf_next);
		tcp_fb_cnt--;
		f->tf_fb = NULL;
		free(f, M_TCPFUNCTIONS);
	}
	rw_wunlock(&tcp_function_lock);
	return (0);
}

/*
 * TCP protocol initialization.
 *
 * Sets up the inpcb hash and the TCP zones for the current VNET and calls
 * the init routines of the TCP subsystems.  For the default VNET only, it
 * then initializes the global timer defaults, the function block list and
 * the event handlers.
 */
void
tcp_init(void)
{
	const char *tcbhash_tuneable;
	int hashsize;

	tcbhash_tuneable = "net.inet.tcp.tcbhashsize";

#ifdef TCP_HHOOK
	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
	    &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
		printf("%s: WARNING: unable to register helper hook\n", __func__);
	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
	    &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
		printf("%s: WARNING: unable to register helper hook\n", __func__);
#endif
#ifdef STATS
	if (tcp_stats_init())
		printf("%s: WARNING: unable to initialise TCP stats\n",
		    __func__);
#endif
	/* The compile-time value may be overridden by the tunable. */
	hashsize = TCBHASHSIZE;
	TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
	if (hashsize == 0) {
		/*
		 * Auto tune the hash size based on maxsockets.
		 * A perfect hash would have a 1:1 mapping
		 * (hashsize = maxsockets) however it's been
		 * suggested that O(2) average is better.
*/
		hashsize = maketcp_hashsize(maxsockets / 4);
		/*
		 * Our historical default is 512,
		 * do not autotune lower than this.
		 */
		if (hashsize < 512)
			hashsize = 512;
		if (bootverbose && IS_DEFAULT_VNET(curvnet))
			printf("%s: %s auto tuned to %d\n", __func__,
			    tcbhash_tuneable, hashsize);
	}
	/*
	 * We require a hashsize to be a power of two.
	 * Previously if it was not a power of two we would just reset it
	 * back to 512, which could be a nasty surprise if you did not notice
	 * the error message.
	 * Instead what we do is clip it to the closest power of two lower
	 * than the specified hash value.
	 *
	 * NOTE(review): maketcp_hashsize() is documented as returning the
	 * next power of two *higher* than its argument (falling back to a
	 * lower one only on overflow) -- confirm which direction is meant.
	 */
	if (!powerof2(hashsize)) {
		int oldhashsize = hashsize;

		hashsize = maketcp_hashsize(hashsize);
		/* prevent absurdly low value */
		if (hashsize < 16)
			hashsize = 16;
		printf("%s: WARNING: TCB hash size not a power of 2, "
		    "clipped from %d to %d.\n", __func__, oldhashsize,
		    hashsize);
	}
	in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
	    "tcp_inpcb", tcp_inpcb_init, IPI_HASHFIELDS_4TUPLE);

	/*
	 * These have to be type stable for the benefit of the timers.
	 */
	V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_tcpcb_zone, maxsockets);
	uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached");

	tcp_tw_init();
	syncache_init();
	tcp_hc_init();

	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
	V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);

	tcp_fastopen_init();

	/* Skip initialization of globals for non-default instances. */
	if (!IS_DEFAULT_VNET(curvnet))
		return;

	tcp_reass_global_init();

	/* XXX virtualize those below? */
	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_msl = TCPTV_MSL;
	/* The retransmit timer values are clamped to at least 1. */
	tcp_rexmit_initial = TCPTV_RTOBASE;
	if (tcp_rexmit_initial < 1)
		tcp_rexmit_initial = 1;
	tcp_rexmit_min = TCPTV_MIN;
	if (tcp_rexmit_min < 1)
		tcp_rexmit_min = 1;
	tcp_persmin = TCPTV_PERSMIN;
	tcp_persmax = TCPTV_PERSMAX;
	tcp_rexmit_slop = TCPTV_CPU_VAR;
	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
	/* Publish the chosen size through the read-only sysctl. */
	tcp_tcbhashsize = hashsize;
	/* Setup the tcp function block list */
	TAILQ_INIT(&t_functions);
	rw_init(&tcp_function_lock, "tcp_func_lock");
	register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
#ifdef TCP_BLACKBOX
	/* Initialize the TCP logging data. */
	tcp_log_init();
#endif
	arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0);

	if (tcp_soreceive_stream) {
#ifdef INET
		tcp_usrreqs.pru_soreceive = soreceive_stream;
#endif
#ifdef INET6
		tcp6_usrreqs.pru_soreceive = soreceive_stream;
#endif /* INET6 */
	}

	/* Smallest IP+TCP header; used only for the two checks below. */
#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR

	ISN_LOCK_INIT();
	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
	    SHUTDOWN_PRI_DEFAULT);
	EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);

	tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK);
	tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK);
	tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK);
	tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK);
	tcp_extra_mbuf = counter_u64_alloc(M_WAITOK);
	tcp_would_have_but = counter_u64_alloc(M_WAITOK);
	tcp_comp_total = counter_u64_alloc(M_WAITOK);
	tcp_uncomp_total = counter_u64_alloc(M_WAITOK);
#ifdef TCPPCAP
	tcp_pcap_init();
#endif
}

#ifdef VIMAGE
static void
tcp_destroy(void *unused __unused) { int n; #ifdef TCP_HHOOK int error; #endif /* * All our processes are gone, all our sockets should be cleaned * up, which means, we should be past the tcp_discardcb() calls. * Sleep to let all tcpcb timers really disappear and cleanup. */ for (;;) { INP_LIST_RLOCK(&V_tcbinfo); n = V_tcbinfo.ipi_count; INP_LIST_RUNLOCK(&V_tcbinfo); if (n == 0) break; pause("tcpdes", hz / 10); } tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); in_pcbinfo_destroy(&V_tcbinfo); /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); /* * Cannot free the zone until all tcpcbs are released as we attach * the allocations to them. */ tcp_fastopen_destroy(); #ifdef TCP_HHOOK error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } #endif } VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. 
*/ void tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); if (port == 0) ip6->ip6_nxt = IPPROTO_TCP; else ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; if (port == 0) ip->ip_p = IPPROTO_TCP; else ip->ip_p = IPPROTO_UDP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. 
* This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the segment th,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then th must point to *inside* the mbuf.
 *
 * At least one of tp and m must be non-NULL (asserted below), and the
 * caller must be inside the network epoch.  When m == NULL the flags
 * argument is overridden with TH_ACK.  The function returns silently,
 * without sending, if an mbuf cannot be allocated or (when signatures
 * are compiled in and requested) the MD5 signature cannot be produced.
 */
void
tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
    tcp_seq ack, tcp_seq seq, int flags)
{
	struct tcpopt to;
	struct inpcb *inp;
	struct ip *ip;
	struct mbuf *optm;
	struct udphdr *uh = NULL;
	struct tcphdr *nth;
	u_char *optp;
#ifdef INET6
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int optlen, tlen, win, ulen;
	bool incl_opts;
	uint16_t port;

	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
	NET_EPOCH_ASSERT();

	/* The address family is taken from the version nibble of ipgen. */
#ifdef INET6
	isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4);
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp != NULL) {
		inp = tp->t_inpcb;
		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
		INP_LOCK_ASSERT(inp);
	} else
		inp = NULL;

	/*
	 * A non-zero port means the reply is sent as TCP-over-UDP: a UDP
	 * header is inserted between the IP and TCP headers below.  For a
	 * received segment the port comes from the mbuf packet header,
	 * otherwise from the tcpcb.
	 */
	if (m != NULL) {
#ifdef INET6
		if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP))
			port = m->m_pkthdr.tcp_tun_port;
		else
#endif
		if (ip && (ip->ip_p == IPPROTO_UDP))
			port = m->m_pkthdr.tcp_tun_port;
		else
			port = 0;
	} else
		port = tp->t_port;

	incl_opts = false;
	win = 0;
	if (tp != NULL) {
		/* A RST advertises a zero window; otherwise clamp to the scaled max. */
		if (!(flags & TH_RST)) {
			win = sbspace(&inp->inp_socket->so_rcv);
			if (win > TCP_MAXWIN << tp->rcv_scale)
				win = TCP_MAXWIN << tp->rcv_scale;
		}
		if ((tp->t_flags & TF_NOOPT) == 0)
			incl_opts = true;
	}
	if (m == NULL) {
		/* Template path: build the reply in a fresh mbuf, no address swap. */
		m = m_gethdr(M_NOWAIT, MT_DATA);
		if (m == NULL)
			return;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
			if (port) {
				/* Insert a UDP header */
				uh = (struct udphdr *)nth;
				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
				uh->uh_dport = port;
				nth = (struct tcphdr *)(uh + 1);
			}
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
			ip = mtod(m, struct ip *);
			nth = (struct tcphdr *)(ip + 1);
			if (port) {
				/* Insert a UDP header */
				uh = (struct udphdr *)nth;
				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
				uh->uh_dport = port;
				nth = (struct tcphdr *)(uh + 1);
			}
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		/* Caller-supplied flags are ignored on this path. */
		flags = TH_ACK;
	} else if ((!M_WRITABLE(m)) || (port != 0)) {
		struct mbuf *n;

		/* Can't reuse 'm', allocate a new mbuf. */
		n = m_gethdr(M_NOWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(m);
			return;
		}

		if (!m_dup_pkthdr(n, m, M_NOWAIT)) {
			m_freem(m);
			m_freem(n);
			return;
		}

		n->m_data += max_linkhdr;
		/* m_len is set later */
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(n, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(n, struct ip6_hdr *);
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
			if (port) {
				/* Insert a UDP header */
				uh = (struct udphdr *)nth;
				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
				uh->uh_dport = port;
				nth = (struct tcphdr *)(uh + 1);
			}
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip));
			ip = mtod(n, struct ip *);
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
			nth = (struct tcphdr *)(ip + 1);
			if (port) {
				/* Insert a UDP header */
				uh = (struct udphdr *)nth;
				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
				uh->uh_dport = port;
				nth = (struct tcphdr *)(uh + 1);
			}
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		xchg(nth->th_dport, nth->th_sport, uint16_t);
		/* th pointed into 'm', which is about to be freed. */
		th = nth;
		m_freem(m);
		m = n;
	} else {
		/*
		 *  reuse the mbuf.
		 * XXX MRT We inherit the FIB, which is lucky.
		 */
		m_freem(m->m_next);
		m->m_next = NULL;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
			nth = (struct tcphdr *)(ip + 1);
		}
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, uint16_t);
#undef xchg
	}
	/* tlen: bytes of IP + (optional UDP) + TCP header, before options. */
	tlen = 0;
#ifdef INET6
	if (isipv6)
		tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		tlen = sizeof (struct tcpiphdr);
#endif
	if (port)
		tlen += sizeof (struct udphdr);
#ifdef INVARIANTS
	m->m_len = 0;
	KASSERT(M_TRAILINGSPACE(m) >= tlen,
	    ("Not enough trailing space for message (m=%p, need=%d, have=%ld)",
	    m, tlen, (long)M_TRAILINGSPACE(m)));
#endif
	m->m_len = tlen;
	to.to_flags = 0;
	if (incl_opts) {
		/* Make sure we have room. */
		if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) {
			m->m_next = m_get(M_NOWAIT, MT_DATA);
			if (m->m_next) {
				optp = mtod(m->m_next, u_char *);
				optm = m->m_next;
			} else
				incl_opts = false;
		} else {
			optp = (u_char *) (nth + 1);
			optm = m;
		}
	}
	if (incl_opts) {
		/* Timestamps. */
		if (tp->t_flags & TF_RCVD_TSTMP) {
			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
			to.to_tsecr = tp->ts_recent;
			to.to_flags |= TOF_TS;
		}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
		/* TCP-MD5 (RFC2385). */
		if (tp->t_flags & TF_SIGNATURE)
			to.to_flags |= TOF_SIGNATURE;
#endif
		/* Add the options. */
		tlen += optlen = tcp_addoptions(&to, optp);

		/* Update m_len in the correct mbuf. */
		optm->m_len += optlen;
	} else
		optlen = 0;
	/*
	 * Finish the IP header.  ulen is only assigned when a UDP header
	 * was inserted (uh != NULL), which happens exactly when port != 0.
	 */
#ifdef INET6
	if (isipv6) {
		if (uh) {
			ulen = tlen - sizeof(struct ip6_hdr);
			uh->uh_ulen = htons(ulen);
		}
		ip6->ip6_flow = 0;
		ip6->ip6_vfc = IPV6_VERSION;
		if (port)
			ip6->ip6_nxt = IPPROTO_UDP;
		else
			ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons(tlen - sizeof(*ip6));
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		if (uh) {
			ulen = tlen - sizeof(struct ip);
			uh->uh_ulen = htons(ulen);
		}
		ip->ip_len = htons(tlen);
		ip->ip_ttl = V_ip_defttl;
		if (port) {
			ip->ip_p = IPPROTO_UDP;
		} else {
			ip->ip_p = IPPROTO_TCP;
		}
		if (V_path_mtu_discovery)
			ip->ip_off |= htons(IP_DF);
	}
#endif
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
#ifdef MAC
	if (inp != NULL) {
		/*
		 * Packet is associated with a socket, so allow the
		 * label of the response to reflect the socket label.
		 */
		INP_LOCK_ASSERT(inp);
		mac_inpcb_create_mbuf(inp, m);
	} else {
		/*
		 * Packet is not associated with a socket, so possibly
		 * update the label in place.
		 */
		mac_netinet_tcp_reply(m);
	}
#endif
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	/* Data offset is expressed in 32-bit words. */
	nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
	nth->th_flags = flags;
	if (tp != NULL)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;

#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
	if (to.to_flags & TOF_SIGNATURE) {
		if (!TCPMD5_ENABLED() ||
		    TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) {
			m_freem(m);
			return;
		}
	}
#endif

	/*
	 * Store the pseudo-header sum and request checksum completion via
	 * csum_flags/csum_data.  With tunneling, the UDP checksum is the
	 * one requested and the TCP checksum field is left zero.
	 */
#ifdef INET6
	if (isipv6) {
		if (port) {
			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
			uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
			nth->th_sum = 0;
		} else {
			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
			nth->th_sum = in6_cksum_pseudo(ip6,
			    tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0);
		}
		ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
		    NULL, NULL);
	}
#endif /* INET6 */
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET
	{
		if (port) {
			uh->uh_sum = in_pseudo(ip->ip_src.s_addr,
			    ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
			m->m_pkthdr.csum_flags = CSUM_UDP;
			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
			nth->th_sum = 0;
		} else {
			m->m_pkthdr.csum_flags = CSUM_TCP;
			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
			nth->th_sum = in_pseudo(ip->ip_src.s_addr,
			    ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
		}
	}
#endif /* INET */
#ifdef TCPDEBUG
	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
	TCP_PROBE3(debug__output, tp, th, m);
	if (flags & TH_RST)
		TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);

#ifdef INET6
	if (isipv6) {
		TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
		(void)ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
	}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		TCP_PROBE5(send, NULL, tp, ip, tp, nth);
		(void)ip_output(m, NULL, NULL, 0, NULL, inp);
	}
#endif
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 *
 * Returns the new tcpcb, or NULL if the zone allocation or any of the
 * congestion-control, helper or function-block init hooks fails.  On
 * success the tcpcb holds a reference on inp and inp->inp_ppcb points
 * at it.
 */
struct tcpcb *
tcp_newtcpcb(struct inpcb *inp)
{
	struct tcpcb_mem *tm;
	struct tcpcb *tp;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
	if (tm == NULL)
		return (NULL);
	tp = &tm->tcb;

	/* Initialise cc_var struct for this tcpcb. */
	tp->ccv = &tm->ccv;
	tp->ccv->type = IPPROTO_TCP;
	tp->ccv->ccvc.tcp = tp;
	/* Take a counted reference on the current function block. */
	rw_rlock(&tcp_function_lock);
	tp->t_fb = tcp_func_set_ptr;
	refcount_acquire(&tp->t_fb->tfb_refcnt);
	rw_runlock(&tcp_function_lock);
	/*
	 * Use the current system default CC algorithm.
	 */
	CC_LIST_RLOCK();
	KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
	CC_ALGO(tp) = CC_DEFAULT();
	CC_LIST_RUNLOCK();
	/*
	 * The tcpcb will hold a reference on its inpcb until tcp_discardcb()
	 * is called.
	 */
	in_pcbref(inp);	/* Reference for tcpcb */
	tp->t_inpcb = inp;

	if (CC_ALGO(tp)->cb_init != NULL)
		if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
			if (tp->t_fb->tfb_tcp_fb_fini)
				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
			in_pcbrele_wlocked(inp);
			refcount_release(&tp->t_fb->tfb_refcnt);
			uma_zfree(V_tcpcb_zone, tm);
			return (NULL);
		}

#ifdef TCP_HHOOK
	tp->osd = &tm->osd;
	if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
		if (tp->t_fb->tfb_tcp_fb_fini)
			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
		in_pcbrele_wlocked(inp);
		refcount_release(&tp->t_fb->tfb_refcnt);
		uma_zfree(V_tcpcb_zone, tm);
		return (NULL);
	}
#endif

#ifdef VIMAGE
	tp->t_vnet = inp->inp_vnet;
#endif
	tp->t_timers = &tm->tt;
	TAILQ_INIT(&tp->t_segq);
	tp->t_maxseg =
#ifdef INET6
		isipv6 ? V_tcp_v6mssdflt :
#endif /* INET6 */
		V_tcp_mssdflt;

	/* Set up our timeouts. */
	callout_init(&tp->t_timers->tt_rexmt, 1);
	callout_init(&tp->t_timers->tt_persist, 1);
	callout_init(&tp->t_timers->tt_keep, 1);
	callout_init(&tp->t_timers->tt_2msl, 1);
	callout_init(&tp->t_timers->tt_delack, 1);

	if (V_tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	if (V_tcp_do_sack)
		tp->t_flags |= TF_SACK_PERMIT;
	TAILQ_INIT(&tp->snd_holes);

	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = tcp_rexmit_min;
	tp->t_rxtcur = tcp_rexmit_initial;
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->t_rcvtime = ticks;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = V_ip_defttl;
	inp->inp_ppcb = tp;
#ifdef TCPPCAP
	/*
	 * Init the TCP PCAP queues.
	 */
	tcp_pcap_tcpcb_init(tp);
#endif
#ifdef TCP_BLACKBOX
	/* Initialize the per-TCPCB log data. */
	tcp_log_tcpcbinit(tp);
#endif
	tp->t_pacing_rate = -1;
	if (tp->t_fb->tfb_tcp_fb_init) {
		if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) {
			/*
			 * NOTE(review): unlike tcp_discardcb(), this path
			 * does not call the CC cb_destroy hook or
			 * khelp_destroy_osd(), and inp->inp_ppcb still
			 * points at the tcpcb being freed -- confirm that
			 * callers handle this.
			 */
			refcount_release(&tp->t_fb->tfb_refcnt);
			in_pcbrele_wlocked(inp);
			uma_zfree(V_tcpcb_zone, tm);
			return (NULL);
		}
	}
#ifdef STATS
	if (V_tcp_perconn_stats_enable == 1)
		tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0);
#endif
	if (V_tcp_do_lrd)
		tp->t_flags |= TF_LRD;
	return (tp);		/* XXX */
}

/*
 * Switch the congestion control algorithm back to NewReno for any active
 * control blocks using an algorithm which is about to go away.
 * This ensures the CC framework can allow the unload to proceed without leaving
 * any dangling pointers which would trigger a panic.
 * Returning non-zero would inform the CC framework that something went wrong
 * and it would be unsafe to allow the unload to proceed. However, there is no
 * way for this to occur with this implementation so we always return zero.
 */
int
tcp_ccalgounload(struct cc_algo *unload_algo)
{
	struct cc_algo *tmpalgo;
	struct inpcb *inp;
	struct tcpcb *tp;
	VNET_ITERATOR_DECL(vnet_iter);

	/*
	 * Check all active control blocks across all network stacks and change
	 * any that are using "unload_algo" back to NewReno. If "unload_algo"
	 * requires cleanup code to be run, call it.
	 */
	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		INP_INFO_WLOCK(&V_tcbinfo);
		/*
		 * New connections already part way through being initialised
		 * with the CC algo we're removing will not race with this code
		 * because the INP_INFO_WLOCK is held during initialisation. We
		 * therefore don't enter the loop below until the connection
		 * list has stabilised.
		 */
		CK_LIST_FOREACH(inp, &V_tcb, inp_list) {
			INP_WLOCK(inp);
			/* Important to skip tcptw structs. */
			if (!(inp->inp_flags & INP_TIMEWAIT) &&
			    (tp = intotcpcb(inp)) != NULL) {
				/*
				 * By holding INP_WLOCK here, we are assured
				 * that the connection is not currently
				 * executing inside the CC module's functions
				 * i.e. it is safe to make the switch back to
				 * NewReno.
				 */
				if (CC_ALGO(tp) == unload_algo) {
					tmpalgo = CC_ALGO(tp);
					if (tmpalgo->cb_destroy != NULL)
						tmpalgo->cb_destroy(tp->ccv);
					CC_DATA(tp) = NULL;
					/*
					 * NewReno may allocate memory on
					 * demand for certain stateful
					 * configuration as needed, but is
					 * coded to never fail on memory
					 * allocation failure so it is a safe
					 * fallback.
					 */
					CC_ALGO(tp) = &newreno_cc_algo;
				}
			}
			INP_WUNLOCK(inp);
		}
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();

	return (0);
}

/*
 * Drop a TCP connection, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
*/
struct tcpcb *
tcp_drop(struct tcpcb *tp, int errno)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	NET_EPOCH_ASSERT();
	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (TCPS_HAVERCVDSYN(tp->t_state)) {
		/* Move to CLOSED first, then run the stack's output routine. */
		tcp_state_change(tp, TCPS_CLOSED);
		(void) tp->t_fb->tfb_tcp_output(tp);
		TCPSTAT_INC(tcps_drops);
	} else
		TCPSTAT_INC(tcps_conndrops);
	/* A recorded soft error takes precedence over a plain timeout. */
	if (errno == ETIMEDOUT && tp->t_softerror)
		errno = tp->t_softerror;
	so->so_error = errno;
	/* The return value is whatever tcp_close() returns (tp or NULL). */
	return (tcp_close(tp));
}

/*
 * Tear down a tcpcb: stop its timers, release per-connection resources
 * (reassembly queue, SACK holes, congestion-control state and the
 * optional offload, pcap, helper and stats state), and detach it from
 * its inpcb.  The tcpcb memory itself is freed here only if no timer
 * drain is pending (tt_draincnt == 0); otherwise freeing is left to
 * tcp_timer_discard().  The inpcb must be write-locked.
 */
void
tcp_discardcb(struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	int released __unused;

	INP_WLOCK_ASSERT(inp);

	/*
	 * Make sure that all of our timers are stopped before we delete the
	 * PCB.
	 *
	 * If stopping a timer fails, we schedule a discard function in same
	 * callout, and the last discard function called will take care of
	 * deleting the tcpcb.
	 */
	tp->t_timers->tt_draincnt = 0;
	tcp_timer_stop(tp, TT_REXMT);
	tcp_timer_stop(tp, TT_PERSIST);
	tcp_timer_stop(tp, TT_KEEP);
	tcp_timer_stop(tp, TT_2MSL);
	tcp_timer_stop(tp, TT_DELACK);
	if (tp->t_fb->tfb_tcp_timer_stop_all) {
		/*
		 * Call the stop-all function of the methods,
		 * this function should call the tcp_timer_stop()
		 * method with each of the function specific timeouts.
		 * That stop will be called via the tfb_tcp_timer_stop()
		 * which should use the async drain function of the
		 * callout system (see tcp_var.h).
		 */
		tp->t_fb->tfb_tcp_timer_stop_all(tp);
	}

	/* free the reassembly queue, if any */
	tcp_reass_flush(tp);

#ifdef TCP_OFFLOAD
	/* Disconnect offload device, if any. */
	if (tp->t_flags & TF_TOE)
		tcp_offload_detach(tp);
#endif

	tcp_free_sackholes(tp);

#ifdef TCPPCAP
	/* Free the TCP PCAP queues. */
	tcp_pcap_drain(&(tp->t_inpkts));
	tcp_pcap_drain(&(tp->t_outpkts));
#endif

	/* Allow the CC algorithm to clean up after itself. */
	if (CC_ALGO(tp)->cb_destroy != NULL)
		CC_ALGO(tp)->cb_destroy(tp->ccv);
	CC_DATA(tp) = NULL;

#ifdef TCP_HHOOK
	khelp_destroy_osd(tp->osd);
#endif
#ifdef STATS
	stats_blob_destroy(tp->t_stats);
#endif

	CC_ALGO(tp) = NULL;
	inp->inp_ppcb = NULL;
	if (tp->t_timers->tt_draincnt == 0) {
		/* We own the last reference on tcpcb, let's free it. */
#ifdef TCP_BLACKBOX
		tcp_log_tcpcbfini(tp);
#endif
		TCPSTATES_DEC(tp->t_state);
		if (tp->t_fb->tfb_tcp_fb_fini)
			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);

		/*
		 * If we got enough samples through the srtt filter,
		 * save the rtt and rttvar in the routing entry.
		 * 'Enough' is arbitrarily defined as 4 rtt samples.
		 * 4 samples is enough for the srtt filter to converge
		 * to within enough % of the correct value; fewer samples
		 * and we could save a bogus rtt. The danger is not high
		 * as tcp quickly recovers from everything.
		 * XXX: Works very well but needs some more statistics!
		 *
		 * XXXRRS: Updating must be after the stack fini() since
		 * that may be converting some internal representation of
		 * say srtt etc into the general one used by other stacks.
		 * Lets also at least protect against the so being NULL
		 * as RW stated below.
		 */
		if ((tp->t_rttupdated >= 4) && (so != NULL)) {
			struct hc_metrics_lite metrics;
			uint32_t ssthresh;

			bzero(&metrics, sizeof(metrics));
			/*
			 * Update the ssthresh always when the conditions below
			 * are satisfied. This gives us better new start value
			 * for the congestion avoidance for new connections.
			 * ssthresh is only set if packet loss occurred on a session.
			 *
			 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
			 * being torn down.  Ideally this code would not use 'so'.
			 */
			ssthresh = tp->snd_ssthresh;
			if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
				/*
				 * convert the limit from user data bytes to
				 * packets then to packet data bytes.
				 */
				ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
				if (ssthresh < 2)
					ssthresh = 2;
				ssthresh *= (tp->t_maxseg +
#ifdef INET6
				    (isipv6 ? sizeof (struct ip6_hdr) +
				    sizeof (struct tcphdr) :
#endif
				    sizeof (struct tcpiphdr)
#ifdef INET6
				    )
#endif
				    );
			} else
				ssthresh = 0;
			metrics.rmx_ssthresh = ssthresh;

			metrics.rmx_rtt = tp->t_srtt;
			metrics.rmx_rttvar = tp->t_rttvar;
			metrics.rmx_cwnd = tp->snd_cwnd;
			metrics.rmx_sendpipe = 0;
			metrics.rmx_recvpipe = 0;

			tcp_hc_update(&inp->inp_inc, &metrics);
		}
		refcount_release(&tp->t_fb->tfb_refcnt);
		tp->t_inpcb = NULL;
		uma_zfree(V_tcpcb_zone, tp);
		/* Drop the tcpcb's inpcb reference; it must not be the last one. */
		released = in_pcbrele_wlocked(inp);
		KASSERT(!released, ("%s: inp %p should not have been released "
		    "here", __func__, inp));
	}
}

/*
 * Callout-side counterpart of tcp_discardcb(): each invocation
 * decrements tt_draincnt, and the invocation that brings it to zero
 * frees the tcpcb and drops its inpcb reference.  If that release
 * reports the inpcb as released, the function returns without
 * unlocking it.
 */
void
tcp_timer_discard(void *ptp)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	struct epoch_tracker et;

	tp = (struct tcpcb *)ptp;
	CURVNET_SET(tp->t_vnet);
	NET_EPOCH_ENTER(et);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
		__func__, tp));
	INP_WLOCK(inp);
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
		("%s: tcpcb has to be stopped here", __func__));
	tp->t_timers->tt_draincnt--;
	if (tp->t_timers->tt_draincnt == 0) {
		/* We own the last reference on this tcpcb, let's free it. */
#ifdef TCP_BLACKBOX
		tcp_log_tcpcbfini(tp);
#endif
		TCPSTATES_DEC(tp->t_state);
		if (tp->t_fb->tfb_tcp_fb_fini)
			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
		refcount_release(&tp->t_fb->tfb_refcnt);
		tp->t_inpcb = NULL;
		uma_zfree(V_tcpcb_zone, tp);
		if (in_pcbrele_wlocked(inp)) {
			/* inpcb is gone: nothing left to unlock. */
			NET_EPOCH_EXIT(et);
			CURVNET_RESTORE();
			return;
		}
	}
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

/*
 * Attempt to close a TCP control block, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 *
 * Returns NULL when the INP_SOCKREF reference was held: in that case the
 * inpcb has been write-unlocked and sofree() has been called on the
 * socket.  Otherwise returns tp with the inpcb still locked.
 */
struct tcpcb *
tcp_close(struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;

	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_state == TCPS_LISTEN)
		tcp_offload_listen_stop(tp);
#endif
	/*
	 * This releases the TFO pending counter resource for TFO listen
	 * sockets as well as passively-created TFO sockets that transition
	 * from SYN_RECEIVED to CLOSED.
	 */
	if (tp->t_tfo_pending) {
		tcp_fastopen_decrement_counter(tp->t_tfo_pending);
		tp->t_tfo_pending = NULL;
	}
	in_pcbdrop(inp);
	TCPSTAT_INC(tcps_closed);
	if (tp->t_state != TCPS_CLOSED)
		tcp_state_change(tp, TCPS_CLOSED);
	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
	so = inp->inp_socket;
	soisdisconnected(so);
	if (inp->inp_flags & INP_SOCKREF) {
		KASSERT(so->so_state & SS_PROTOREF,
		    ("tcp_close: !SS_PROTOREF"));
		inp->inp_flags &= ~INP_SOCKREF;
		/* The inpcb lock is dropped before the socket lock is taken. */
		INP_WUNLOCK(inp);
		SOCK_LOCK(so);
		so->so_state &= ~SS_PROTOREF;
		sofree(so);
		return (NULL);
	}
	return (tp);
}

/*
 * Release reclaimable per-connection memory in every vnet: reassembly
 * queues, SACK reports, and (when compiled in) black-box logs and pcap
 * queues.  Does nothing unless do_tcpdrain is set.  Connections flagged
 * INP_TIMEWAIT are skipped.
 */
void
tcp_drain(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	if (!do_tcpdrain)
		return;

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		struct inpcb *inpb;
		struct tcpcb *tcpb;

	/*
	 * Walk the tcpbs, if existing, and flush the reassembly queue,
	 * if there is one...
	 * XXX: The "Net/3" implementation doesn't imply that the TCP
	 *      reassembly queue should be flushed, but in a situation
	 *	where we're really low on mbufs, this is potentially
	 *	useful.
	 */
		INP_INFO_WLOCK(&V_tcbinfo);
		CK_LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
			INP_WLOCK(inpb);
			if (inpb->inp_flags & INP_TIMEWAIT) {
				INP_WUNLOCK(inpb);
				continue;
			}
			if ((tcpb = intotcpcb(inpb)) != NULL) {
				tcp_reass_flush(tcpb);
				tcp_clean_sackreport(tcpb);
#ifdef TCP_BLACKBOX
				tcp_log_drain(tcpb);
#endif
#ifdef TCPPCAP
				if (tcp_pcap_aggressive_free) {
					/* Free the TCP PCAP queues. */
					tcp_pcap_drain(&(tcpb->t_inpkts));
					tcp_pcap_drain(&(tcpb->t_outpkts));
				}
#endif
			}
			INP_WUNLOCK(inpb);
		}
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
*/ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { if (inp->inp_route.ro_nh) { NH_FREE(inp->inp_route.ro_nh); inp->inp_route.ro_nh = (struct nhop_object *)NULL; } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { struct epoch_tracker et; struct inpcb *inp; struct xinpgen xig; int error; if (req->newptr != NULL) return (EPERM); if (req->oldptr == NULL) { int n; n = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req); if (error) return (error); NET_EPOCH_ENTER(et); for (inp = CK_LIST_FIRST(V_tcbinfo.ipi_listhead); inp != NULL; inp = CK_LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); if (inp->inp_gencnt 
<= xig.xig_gen) { int crerr; /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_flags & INP_TIMEWAIT) { if (intotw(inp) != NULL) crerr = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else crerr = EINVAL; /* Skip this inp. */ } else crerr = cr_canseeinpcb(req->td->td_ucred, inp); if (crerr == 0) { struct xtcpcb xt; tcp_inptoxtp(inp, &xt); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) break; else continue; } } INP_RUNLOCK(inp); } NET_EPOCH_EXIT(et); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = SYSCTL_OUT(req, &xig, sizeof xig); } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct epoch_tracker et; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); NET_EPOCH_ENTER(et); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } 
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct epoch_tracker et; struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } NET_EPOCH_ENTER(et); #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET /* Path MTU to try next when a fragmentation-needed message is received. 
*/
static inline int
tcp_next_pmtu(const struct icmp *icp, const struct ip *ip)
{
	int mtu = ntohs(icp->icmp_nextmtu);

	/* If no alternative MTU was proposed, try the next smaller one. */
	if (!mtu)
		mtu = ip_next_mtu(ntohs(ip->ip_len), 1);
	/* Never go below the minimum MSS plus the TCP/IP header size. */
	if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr))
		mtu = V_tcp_minmss + sizeof(struct tcpiphdr);

	return (mtu);
}

/*
 * Common IPv4 control-input handler.  cmd is a PRC_* code, sa the
 * address the error refers to, and vip (may be NULL) the IP header
 * quoted inside the ICMP message.  port is compared against the
 * connection's t_port, so only connections using the same tunneling
 * port (0 for plain TCP) are affected.
 *
 * With no quoted header, or for PRC_HOSTDEAD, every matching PCB is
 * notified.  Otherwise the single connection is looked up (returned
 * write-locked), the quoted sequence number is checked against
 * [snd_una, snd_max), and either path-MTU discovery or the selected
 * notify routine is run.  If no PCB matches, the syncache is informed.
 */
static void
tcp_ctlinput_with_port(int cmd, struct sockaddr *sa, void *vip, uint16_t port)
{
	struct ip *ip = vip;
	struct tcphdr *th;
	struct in_addr faddr;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	struct icmp *icp;
	struct in_conninfo inc;
	tcp_seq icmp_tcp_seq;
	int mtu;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc_notify;
	else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
		cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
		cmd == PRC_TIMXCEED_INTRANS) && ip)
		notify = tcp_drop_syn_sent;

	/*
	 * Hostdead is ugly because it goes linearly through all PCBs.
	 * XXX: We never get this from ICMP, otherwise it makes an
	 * excellent DoS attack on machines with many connections.
	 */
	else if (cmd == PRC_HOSTDEAD)
		ip = NULL;
	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
		return;

	if (ip == NULL) {
		in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
		return;
	}

	/*
	 * The quoted IP header is the icmp_ip member of the ICMP message,
	 * so step back to the ICMP header; the quoted TCP header follows
	 * the quoted IP header.
	 */
	icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip));
	th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
	inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src,
	    th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
	if (inp != NULL && PRC_IS_REDIRECT(cmd)) {
		/* signal EHOSTDOWN, as it flushes the cached route */
		inp = (*notify)(inp, EHOSTDOWN);
		goto out;
	}
	icmp_tcp_seq = th->th_seq;
	if (inp != NULL)  {
		if (!(inp->inp_flags & INP_TIMEWAIT) &&
		    !(inp->inp_flags & INP_DROPPED) &&
		    !(inp->inp_socket == NULL)) {
			tp = intotcpcb(inp);
#ifdef TCP_OFFLOAD
			if (tp->t_flags & TF_TOE && cmd == PRC_MSGSIZE) {
				/*
				 * MTU discovery for offloaded connections.  Let
				 * the TOE driver verify seq# and process it.
				 */
				mtu = tcp_next_pmtu(icp, ip);
				tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu);
				goto out;
			}
#endif
			if (tp->t_port != port) {
				goto out;
			}
			/* Only act if the quoted sequence number is in flight. */
			if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) &&
			    SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) {
				if (cmd == PRC_MSGSIZE) {
					/*
					 * MTU discovery: we got a needfrag and
					 * will potentially try a lower MTU.
					 */
					mtu = tcp_next_pmtu(icp, ip);

					/*
					 * Only process the offered MTU if it
					 * is smaller than the current one.
					 */
					if (mtu < tp->t_maxseg +
					    sizeof(struct tcpiphdr)) {
						bzero(&inc, sizeof(inc));
						inc.inc_faddr = faddr;
						inc.inc_fibnum =
						    inp->inp_inc.inc_fibnum;
						tcp_hc_updatemtu(&inc, mtu);
						tcp_mtudisc(inp, mtu);
					}
				} else
					/* notify may return NULL; then 'out' skips the unlock. */
					inp = (*notify)(inp,
					    inetctlerrmap[cmd]);
			}
		}
	} else {
		bzero(&inc, sizeof(inc));
		inc.inc_fport = th->th_dport;
		inc.inc_lport = th->th_sport;
		inc.inc_faddr = faddr;
		inc.inc_laddr = ip->ip_src;
		syncache_unreach(&inc, icmp_tcp_seq, port);
	}
out:
	if (inp != NULL)
		INP_WUNLOCK(inp);
}

/* Control input for plain (non-tunneled) TCP over IPv4: port is 0. */
void
tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	tcp_ctlinput_with_port(cmd, sa, vip, htons(0));
}

/*
 * Control input for TCP tunneled over UDP (IPv4).  Checks that the
 * outer packet is long enough and that the quoted UDP source port is
 * V_tcp_udp_tunneling_port, then copies the quoted TCP header over the
 * quoted UDP header, reduces the outer ip_len by the UDP header size,
 * and hands off to tcp_ctlinput_with_port() with the quoted UDP
 * destination port.
 */
void
tcp_ctlinput_viaudp(int cmd, struct sockaddr *sa, void *vip, void *unused)
{
	/* Its a tunneled TCP over UDP icmp */
	struct ip *outer_ip, *inner_ip;
	struct icmp *icmp;
	struct udphdr *udp;
	struct tcphdr *th, ttemp;
	int i_hlen, o_len;
	uint16_t port;

	/* Walk backwards from the quoted header to the ICMP and outer IP headers. */
	inner_ip = (struct ip *)vip;
	icmp = (struct icmp *)((caddr_t)inner_ip -
	    (sizeof(struct icmp) - sizeof(struct ip)));
	outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip));
	i_hlen = inner_ip->ip_hl << 2;
	o_len = ntohs(outer_ip->ip_len);
	if (o_len <
	    (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) {
		/* Not enough data present */
		return;
	}
	/* Ok lets strip out the inner udphdr header by copying up on top of it the tcp hdr */
	udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen);
	if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) {
		return;
	}
	port = udp->uh_dport;
	th = (struct tcphdr *)(udp + 1);
	/* Source and destination overlap, so go through a temporary. */
	memcpy(&ttemp, th, sizeof(struct tcphdr));
	memcpy(udp, &ttemp, sizeof(struct tcphdr));
	/* Now adjust down the size of the outer IP header */
	o_len -= sizeof(struct udphdr);
	outer_ip->ip_len = htons(o_len);
	/* Now call in to the normal handling code */
	tcp_ctlinput_with_port(cmd, sa, vip, port);
}
#endif /* INET */

#ifdef INET6
/* Path MTU to try next for IPv6, taken from the ICMPv6 message. */
static inline int
tcp6_next_pmtu(const struct icmp6_hdr *icmp6)
{
	int mtu = ntohl(icmp6->icmp6_mtu);

	/*
	 * If no alternative MTU was proposed, or the proposed MTU was too
	 * small, set to the min.
	 */
	if (mtu < IPV6_MMTU)
		mtu = IPV6_MMTU - 8;	/* XXXNP: what is the adjustment for? */
	return (mtu);
}

/*
 * Common IPv6 control-input handler; the IPv6 counterpart of
 * tcp_ctlinput_with_port().  d, when non-NULL, is a struct ip6ctlparam
 * describing the ICMPv6 message.  The quoted TCP ports and sequence
 * number are copied out of the mbuf chain with m_copydata() after
 * checking the packet length.
 */
static void
tcp6_ctlinput_with_port(int cmd, struct sockaddr *sa, void *d, uint16_t port)
{
	struct in6_addr *dst;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct icmp6_hdr *icmp6;
	struct ip6ctlparam *ip6cp = NULL;
	const struct sockaddr_in6 *sa6_src = NULL;
	struct in_conninfo inc;
	struct tcp_ports {
		uint16_t th_sport;
		uint16_t th_dport;
	} t_ports;
	tcp_seq icmp_tcp_seq;
	unsigned int mtu;
	unsigned int off;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		icmp6 = ip6cp->ip6c_icmp6;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
		dst = ip6cp->ip6c_finaldst;
	} else {
		/* icmp6 stays unset here; ip6 == NULL returns before it is used. */
		m = NULL;
		ip6 = NULL;
		off = 0;	/* fool gcc */
		sa6_src = &sa6_any;
		dst = NULL;
	}

	if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc_notify;
	else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
		cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
		cmd == PRC_TIMXCEED_INTRANS) && ip6 != NULL)
		notify = tcp_drop_syn_sent;

	/*
	 * Hostdead is ugly because it goes linearly through all PCBs.
	 * XXX: We never get this from ICMP, otherwise it makes an
	 * excellent DoS attack on machines with many connections.
	 */
	else if (cmd == PRC_HOSTDEAD)
		ip6 = NULL;
	else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)
		return;

	if (ip6 == NULL) {
		in6_pcbnotify(&V_tcbinfo, sa, 0,
			      (const struct sockaddr *)sa6_src,
			      0, cmd, NULL, notify);
		return;
	}

	/* Check if we can safely get the ports from the tcp hdr */
	if (m == NULL ||
	    (m->m_pkthdr.len <
		(int32_t) (off + sizeof(struct tcp_ports)))) {
		return;
	}
	bzero(&t_ports, sizeof(struct tcp_ports));
	m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports);
	inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport,
	    &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL);
	if (inp != NULL && PRC_IS_REDIRECT(cmd)) {
		/* signal EHOSTDOWN, as it flushes the cached route */
		inp = (*notify)(inp, EHOSTDOWN);
		goto out;
	}
	/* The sequence number follows the two port fields. */
	off += sizeof(struct tcp_ports);
	if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) {
		goto out;
	}
	m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq);
	if (inp != NULL) {
		if (!(inp->inp_flags & INP_TIMEWAIT) &&
		    !(inp->inp_flags & INP_DROPPED) &&
		    !(inp->inp_socket == NULL)) {
			tp = intotcpcb(inp);
#ifdef TCP_OFFLOAD
			if (tp->t_flags & TF_TOE && cmd == PRC_MSGSIZE) {
				/* MTU discovery for offloaded connections. */
				mtu = tcp6_next_pmtu(icmp6);
				tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu);
				goto out;
			}
#endif
			if (tp->t_port != port) {
				goto out;
			}
			/* Only act if the quoted sequence number is in flight. */
			if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) &&
			    SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) {
				if (cmd == PRC_MSGSIZE) {
					/*
					 * MTU discovery:
					 * If we got a needfrag set the MTU
					 * in the route to the suggested new
					 * value (if given) and then notify.
					 */
					mtu = tcp6_next_pmtu(icmp6);

					bzero(&inc, sizeof(inc));
					inc.inc_fibnum = M_GETFIB(m);
					inc.inc_flags |= INC_ISIPV6;
					inc.inc6_faddr = *dst;
					if (in6_setscope(&inc.inc6_faddr,
						m->m_pkthdr.rcvif, NULL))
						goto out;
					/*
					 * Only process the offered MTU if it
					 * is smaller than the current one.
					 */
					if (mtu < tp->t_maxseg +
					    sizeof (struct tcphdr) +
					    sizeof (struct ip6_hdr)) {
						tcp_hc_updatemtu(&inc, mtu);
						tcp_mtudisc(inp, mtu);
						ICMP6STAT_INC(icp6s_pmtuchg);
					}
				} else
					/* notify may return NULL; then 'out' skips the unlock. */
					inp = (*notify)(inp,
					    inet6ctlerrmap[cmd]);
			}
		}
	} else {
		bzero(&inc, sizeof(inc));
		inc.inc_fibnum = M_GETFIB(m);
		inc.inc_flags |= INC_ISIPV6;
		inc.inc_fport = t_ports.th_dport;
		inc.inc_lport = t_ports.th_sport;
		inc.inc6_faddr = *dst;
		inc.inc6_laddr = ip6->ip6_src;
		syncache_unreach(&inc, icmp_tcp_seq, port);
	}
out:
	if (inp != NULL)
		INP_WUNLOCK(inp);
}

/* Control input for plain (non-tunneled) TCP over IPv6: port is 0. */
void
tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
	tcp6_ctlinput_with_port(cmd, sa, d, htons(0));
}

/*
 * Control input for TCP tunneled over UDP (IPv6).  Checks that the
 * quoted UDP source port is V_tcp_udp_tunneling_port, trims the UDP
 * header from the mbuf returned by m_pulldown(), and hands off to
 * tcp6_ctlinput_with_port() with the quoted UDP destination port.
 */
void
tcp6_ctlinput_viaudp(int cmd, struct sockaddr *sa, void *d, void *unused)
{
	struct ip6ctlparam *ip6cp;
	struct mbuf *m;
	struct udphdr *udp;
	uint16_t port;

	ip6cp = (struct ip6ctlparam *)d;
	m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL);
	if (m == NULL) {
		return;
	}
	udp = mtod(m, struct udphdr *);
	if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) {
		return;
	}
	port = udp->uh_dport;
	m_adj(m, sizeof(struct udphdr));
	/* If m carries no packet header, adjust the chain head's length by hand. */
	if ((m->m_flags & M_PKTHDR) == 0) {
		ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr);
	}
	/* Now call in to the normal handling code */
	tcp6_ctlinput_with_port(cmd, sa, d, port);
}

#endif /* INET6 */

/*
 * Keyed 32-bit hash of a connection identifier.  SipHash
 * (SipHash24_Init) is run under 'key' over the foreign and local ports
 * followed by the foreign and local addresses of the family selected
 * by INC_ISIPV6 in inc_flags.  The two 32-bit words written by
 * SipHash_Final() are XORed to form the result.  len is only checked,
 * by assertion, to be at least SIPHASH_KEY_LENGTH.
 */
static uint32_t
tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len)
{
	SIPHASH_CTX ctx;
	uint32_t hash[2];

	KASSERT(len >= SIPHASH_KEY_LENGTH,
	    ("%s: keylen %u too short ", __func__, len));
	SipHash24_Init(&ctx);
	SipHash_SetKey(&ctx, (uint8_t *)key);
	SipHash_Update(&ctx, &inc->inc_fport, sizeof(uint16_t));
	SipHash_Update(&ctx, &inc->inc_lport, sizeof(uint16_t));
	switch (inc->inc_flags & INC_ISIPV6) {
#ifdef INET
	case 0:
		SipHash_Update(&ctx, &inc->inc_faddr, sizeof(struct in_addr));
		SipHash_Update(&ctx, &inc->inc_laddr, sizeof(struct in_addr));
		break;
#endif
#ifdef INET6
	case INC_ISIPV6:
		SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(struct in6_addr));
		SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(struct in6_addr));
		break;
#endif
	}
	SipHash_Final((uint8_t *)hash, &ctx);

	return (hash[0] ^ hash[1]);
}

/*
 * Compute the value returned as the timestamp offset for a connection:
 * a keyed hash under V_ts_offset_secret.  Unless
 * V_tcp_ts_offset_per_conn is set, the ports are zeroed in a local copy
 * first, so the result depends only on the address pair.
 */
uint32_t
tcp_new_ts_offset(struct in_conninfo *inc)
{
	struct in_conninfo inc_store, *local_inc;

	if (!V_tcp_ts_offset_per_conn) {
		memcpy(&inc_store, inc, sizeof(struct in_conninfo));
		inc_store.inc_lport = 0;
		inc_store.inc_fport = 0;
		local_inc = &inc_store;
	} else {
		local_inc = inc;
	}
	return (tcp_keyed_hash(local_inc, V_ts_offset_secret,
	    sizeof(V_ts_offset_secret)));
}

/*
 * Following is where TCP initial sequence number generation occurs.
 *
 * There are two places where we must use initial sequence numbers:
 * 1.  In SYN-ACK packets.
 * 2.  In SYN packets.
 *
 * All ISNs for SYN-ACK packets are generated by the syncache.  See
 * tcp_syncache.c for details.
 *
 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
 * depends on this property.  In addition, these ISNs should be
 * unguessable so as to prevent connection hijacking.  To satisfy
 * the requirements of this situation, the algorithm outlined in
 * RFC 1948 is used, with only small modifications.
 *
 * Implementation details:
 *
 * Time is based off the system timer, and is corrected so that it
 * increases by one megabyte per second.  This allows for proper
 * recycling on high speed LANs while still leaving over an hour
 * before rollover.
 *
 * As reading the *exact* system time is too expensive to be done
 * whenever setting up a TCP connection, we increment the time
 * offset in two ways.  First, a small random positive increment
 * is added to isn_offset for each connection that is set up.
 * Second, the function tcp_isn_tick fires once per clock tick
 * and increments isn_offset as necessary so that sequence numbers
 * are incremented at approximately ISN_BYTES_PER_SECOND.  The
 * random positive increments serve only to ensure that the same
 * exact sequence number is never sent out twice (as could otherwise
 * happen when a port is recycled in less than the system tick
 * interval.)
* * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the ISN lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) #define ISN_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]); VNET_DEFINE_STATIC(int, isn_last); VNET_DEFINE_STATIC(int, isn_last_reseed); VNET_DEFINE_STATIC(u_int32_t, isn_offset); VNET_DEFINE_STATIC(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct in_conninfo *inc) { tcp_seq new_isn; u_int32_t projected_offset; ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0); V_isn_last_reseed = ticks; } /* Compute the hash and return the ISN. */ new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret, sizeof(V_isn_secret)); V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. 
This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); if (IS_FASTOPEN(tp->t_flags)) tcp_fastopen_disable_path(tp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { tcp_mtudisc(inp, -1); return (inp); } static void tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return; tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); if (tp->t_fb->tfb_tcp_mtu_chg != NULL) { /* * Conceptually the snd_nxt setting * and freeing sack holes should * be done by the default stacks * own tfb_tcp_mtu_chg(). */ tp->t_fb->tfb_tcp_mtu_chg(tp); } tp->t_fb->tfb_tcp_output(tp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. 
This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ uint32_t tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { nh = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } } return (maxmtu); } #endif /* INET */ #ifdef INET6 uint32_t tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (inc->inc_flags & INC_IPV6MINMTU) return (IPV6_MMTU); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } } return (maxmtu); } #endif /* INET6 */ /* * Calculate effective SMSS per RFC5681 definition for a given TCP * connection at its current state, taking into account SACK and etc. 
*/ u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PADTCPOLEN(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PADTCPOLEN(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PADTCPOLEN(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } u_int tcp_fixed_maxseg(const struct tcpcb *tp) { int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We only consider fixed options that we would send every * time I.e. SACK is not considered. This is important * for cc modules to figure out what the modulo of the * cwnd should be. 
*/ #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PAD(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PAD(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PAD(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) 
return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (inp->inp_flags & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); else INP_WUNLOCK(inp); - } else if (!(inp->inp_flags & INP_DROPPED) && - !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { + } else if ((inp->inp_flags & INP_DROPPED) == 0 && + !SOLISTENING(inp->inp_socket)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; NET_EPOCH_EXIT(et); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "", "Drop TCP connection"); #ifdef KERN_TLS static int sysctl_switch_tls(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct sockaddr_in *fin, *lin; struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } NET_EPOCH_EXIT(et); if (inp != NULL) { if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0 || inp->inp_socket == NULL) { error = ECONNRESET; 
INP_WUNLOCK(inp); } else { struct socket *so; so = inp->inp_socket; soref(so); error = ktls_set_tx_mode(so, arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET); INP_WUNLOCK(inp); SOCK_LOCK(so); sorele(so); } } else error = ESRCH; return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "", "Switch TCP connection to SW TLS"); SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "", "Switch TCP connection to ifnet TLS"); #endif /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (V_tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? 
*/ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } 
/* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif TCPSTATES_DEC(tp->t_state); TCPSTATES_INC(newstate); tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } /* * Create an external-format (``xtcpcb'') structure using the information in * the kernel-format tcpcb structure pointed to by tp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) { struct tcpcb *tp = intotcpcb(inp); struct tcptw *tw = intotw(inp); sbintime_t now; bzero(xt, sizeof(*xt)); if (inp->inp_flags & INP_TIMEWAIT) { xt->t_state = TCPS_TIME_WAIT; xt->xt_encaps_port = tw->t_port; } else { xt->t_state = tp->t_state; xt->t_logstate = tp->t_logstate; xt->t_flags = tp->t_flags; xt->t_sndzerowin = tp->t_sndzerowin; xt->t_sndrexmitpack = tp->t_sndrexmitpack; xt->t_rcvoopack = tp->t_rcvoopack; xt->t_rcv_wnd = tp->rcv_wnd; xt->t_snd_wnd = tp->snd_wnd; xt->t_snd_cwnd = tp->snd_cwnd; xt->t_snd_ssthresh = tp->snd_ssthresh; xt->t_maxseg = tp->t_maxseg; xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 + (tp->t_flags2 & TF2_ACE_PERMIT) ? 
2 : 0; now = getsbinuptime(); #define COPYTIMER(ttt) do { \ if (callout_active(&tp->t_timers->ttt)) \ xt->ttt = (tp->t_timers->ttt.c_time - now) / \ SBT_1MS; \ else \ xt->ttt = 0; \ } while (0) COPYTIMER(tt_delack); COPYTIMER(tt_rexmt); COPYTIMER(tt_persist); COPYTIMER(tt_keep); COPYTIMER(tt_2msl); #undef COPYTIMER xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; xt->xt_encaps_port = tp->t_port; bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, TCP_FUNCTION_NAME_LEN_MAX); bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX); #ifdef TCP_BLACKBOX (void)tcp_log_get_id(tp, xt->xt_logid); #endif } xt->xt_len = sizeof(struct xtcpcb); in_pcbtoxinpcb(inp, &xt->xt_inp); if (inp->inp_socket == NULL) xt->xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; } void tcp_log_end_status(struct tcpcb *tp, uint8_t status) { uint32_t bit, i; if ((tp == NULL) || (status > TCP_EI_STATUS_MAX_VALUE) || (status == 0)) { /* Invalid */ return; } if (status > (sizeof(uint32_t) * 8)) { /* Should this be a KASSERT? */ return; } bit = 1U << (status - 1); if (bit & tp->t_end_info_status) { /* already logged */ return; } for (i = 0; i < TCP_END_BYTE_INFO; i++) { if (tp->t_end_info_bytes[i] == TCP_EI_EMPTY_SLOT) { tp->t_end_info_bytes[i] = status; tp->t_end_info_status |= bit; break; } } } int tcp_can_enable_pacing(void) { if ((tcp_pacing_limit == -1) || (tcp_pacing_limit > number_of_tcp_connections_pacing)) { atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1); shadow_num_connections = number_of_tcp_connections_pacing; return (1); } else { return (0); } } static uint8_t tcp_pacing_warning = 0; void tcp_decrement_paced_conn(void) { uint32_t ret; ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1); shadow_num_connections = number_of_tcp_connections_pacing; KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?")); if (ret == 0) { if (tcp_pacing_limit != -1) { printf("Warning all pacing is now disabled, count decrements invalidly!\n"); tcp_pacing_limit = 0; } else if 
(tcp_pacing_warning == 0) { printf("Warning pacing count is invalid, invalid decrement\n"); tcp_pacing_warning = 1; } } } diff --git a/sys/rpc/svc_vc.c b/sys/rpc/svc_vc.c index de1baa1417b1..234feba5c8bd 100644 --- a/sys/rpc/svc_vc.c +++ b/sys/rpc/svc_vc.c @@ -1,1130 +1,1130 @@ /* $NetBSD: svc_vc.c,v 1.7 2000/08/03 00:01:53 fvdl Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009, Sun Microsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of Sun Microsystems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #if defined(LIBC_SCCS) && !defined(lint) static char *sccsid2 = "@(#)svc_tcp.c 1.21 87/08/11 Copyr 1984 Sun Micro"; static char *sccsid = "@(#)svc_tcp.c 2.2 88/08/01 4.0 RPCSRC"; #endif #include __FBSDID("$FreeBSD$"); /* * svc_vc.c, Server side for Connection Oriented based RPC. * * Actually implements two flavors of transporter - * a tcp rendezvouser (a listner and connection establisher) * and a record/tcp stream. */ #include "opt_kern_tls.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static bool_t svc_vc_rendezvous_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *); static void svc_vc_rendezvous_destroy(SVCXPRT *); static bool_t svc_vc_null(void); static void svc_vc_destroy(SVCXPRT *); static enum xprt_stat svc_vc_stat(SVCXPRT *); static bool_t svc_vc_ack(SVCXPRT *, uint32_t *); static bool_t svc_vc_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static bool_t svc_vc_reply(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *seq); static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in); static bool_t svc_vc_rendezvous_control (SVCXPRT *xprt, const u_int rq, void *in); static void svc_vc_backchannel_destroy(SVCXPRT *); static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *); static bool_t svc_vc_backchannel_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static bool_t svc_vc_backchannel_reply(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *); static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in); static SVCXPRT *svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr); static int svc_vc_accept(struct socket *head, struct socket **sop); static int svc_vc_soupcall(struct 
socket *so, void *arg, int waitflag); static int svc_vc_rendezvous_soupcall(struct socket *, void *, int); static struct xp_ops svc_vc_rendezvous_ops = { .xp_recv = svc_vc_rendezvous_recv, .xp_stat = svc_vc_rendezvous_stat, .xp_reply = (bool_t (*)(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *))svc_vc_null, .xp_destroy = svc_vc_rendezvous_destroy, .xp_control = svc_vc_rendezvous_control }; static struct xp_ops svc_vc_ops = { .xp_recv = svc_vc_recv, .xp_stat = svc_vc_stat, .xp_ack = svc_vc_ack, .xp_reply = svc_vc_reply, .xp_destroy = svc_vc_destroy, .xp_control = svc_vc_control }; static struct xp_ops svc_vc_backchannel_ops = { .xp_recv = svc_vc_backchannel_recv, .xp_stat = svc_vc_backchannel_stat, .xp_reply = svc_vc_backchannel_reply, .xp_destroy = svc_vc_backchannel_destroy, .xp_control = svc_vc_backchannel_control }; /* * Usage: * xprt = svc_vc_create(sock, send_buf_size, recv_buf_size); * * Creates, registers, and returns a (rpc) tcp based transporter. * Once *xprt is initialized, it is registered as a transporter * see (svc.h, xprt_register). This routine returns * a NULL if a problem occurred. * * The filedescriptor passed in is expected to refer to a bound, but * not yet connected socket. * * Since streams do buffered io similar to stdio, the caller can specify * how big the send and receive buffers are via the second and third parms; * 0 => use the system default. 
*/ SVCXPRT * svc_vc_create(SVCPOOL *pool, struct socket *so, size_t sendsize, size_t recvsize) { SVCXPRT *xprt; struct sockaddr* sa; int error; SOCK_LOCK(so); if (so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED)) { SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); CURVNET_RESTORE(); if (error) return (NULL); xprt = svc_vc_create_conn(pool, so, sa); free(sa, M_SONAME); return (xprt); } SOCK_UNLOCK(so); xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = so; xprt->xp_p1 = NULL; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_rendezvous_ops; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) { goto cleanup_svc_vc_create; } memcpy(&xprt->xp_ltaddr, sa, sa->sa_len); free(sa, M_SONAME); xprt_register(xprt); solisten(so, -1, curthread); SOLISTEN_LOCK(so); xprt->xp_upcallset = 1; solisten_upcall_set(so, svc_vc_rendezvous_soupcall, xprt); SOLISTEN_UNLOCK(so); return (xprt); cleanup_svc_vc_create: sx_destroy(&xprt->xp_lock); svc_xprt_free(xprt); return (NULL); } /* * Create a new transport for a socket optained via soaccept(). 
*/ SVCXPRT * svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr) { SVCXPRT *xprt; struct cf_conn *cd; struct sockaddr* sa = NULL; struct sockopt opt; int one = 1; int error; bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = SOL_SOCKET; opt.sopt_name = SO_KEEPALIVE; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error) { return (NULL); } if (so->so_proto->pr_protocol == IPPROTO_TCP) { bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error) { return (NULL); } } cd = mem_alloc(sizeof(*cd)); cd->strm_stat = XPRT_IDLE; xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = so; xprt->xp_p1 = cd; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_ops; /* * See http://www.connectathon.org/talks96/nfstcp.pdf - client * has a 5 minute timer, server has a 6 minute timer. */ xprt->xp_idletimeout = 6 * 60; memcpy(&xprt->xp_rtaddr, raddr, raddr->sa_len); CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) goto cleanup_svc_vc_create; memcpy(&xprt->xp_ltaddr, sa, sa->sa_len); free(sa, M_SONAME); xprt_register(xprt); SOCKBUF_LOCK(&so->so_rcv); xprt->xp_upcallset = 1; soupcall_set(so, SO_RCV, svc_vc_soupcall, xprt); SOCKBUF_UNLOCK(&so->so_rcv); /* * Throw the transport into the active list in case it already * has some data buffered. */ sx_xlock(&xprt->xp_lock); xprt_active(xprt); sx_xunlock(&xprt->xp_lock); return (xprt); cleanup_svc_vc_create: sx_destroy(&xprt->xp_lock); svc_xprt_free(xprt); mem_free(cd, sizeof(*cd)); return (NULL); } /* * Create a new transport for a backchannel on a clnt_vc socket. 
*/ SVCXPRT * svc_vc_create_backchannel(SVCPOOL *pool) { SVCXPRT *xprt = NULL; struct cf_conn *cd = NULL; cd = mem_alloc(sizeof(*cd)); cd->strm_stat = XPRT_IDLE; xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = NULL; xprt->xp_p1 = cd; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_backchannel_ops; return (xprt); } /* * This does all of the accept except the final call to soaccept. The * caller will call soaccept after dropping its locks (soaccept may * call malloc). * * On success the dequeued socket is stored in *sop and 0 is returned; * otherwise an error number is returned (EINVAL if head is not * listening). */ int svc_vc_accept(struct socket *head, struct socket **sop) { struct socket *so; int error = 0; short nbio; /* XXXGL: shouldn't that be an assertion? */ - if ((head->so_options & SO_ACCEPTCONN) == 0) { + if (!SOLISTENING(head)) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(curthread->td_ucred, head); if (error != 0) goto done; #endif /* * XXXGL: we want non-blocking semantics. The socket could be a * socket created by kernel as well as socket shared with userland, * so we can't be sure about presence of SS_NBIO. We also shall not * toggle it on the socket, since that may surprise userland. So we * set SS_NBIO only temporarily. */ SOLISTEN_LOCK(head); nbio = head->so_state & SS_NBIO; head->so_state |= SS_NBIO; error = solisten_dequeue(head, &so, 0); /* NOTE(review): nbio holds at most the SS_NBIO bit, so (nbio & ~SS_NBIO) is always 0 and the next statement clears every so_state flag rather than only restoring SS_NBIO - confirm intent. */ head->so_state &= (nbio & ~SS_NBIO); if (error) goto done; so->so_state |= nbio; *sop = so; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); done: return (error); } /* * Receive op for a rendezvous (listening) transport: accept one * pending connection and wrap it in a new connection transport. * Always returns FALSE; msg, addrp and mp are not used. */ /*ARGSUSED*/ static bool_t svc_vc_rendezvous_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct socket *so = NULL; struct sockaddr *sa = NULL; int error; SVCXPRT *new_xprt; /* * The socket upcall calls xprt_active() which will eventually * cause the server to call us here. We attempt to accept a * connection from the socket and turn it into a new * transport.
If the accept fails, we have drained all pending * connections so we call xprt_inactive(). */ sx_xlock(&xprt->xp_lock); error = svc_vc_accept(xprt->xp_socket, &so); if (error == EWOULDBLOCK) { /* * We must re-test for new connections after taking * the lock to protect us in the case where a new * connection arrives after our call to accept fails * with EWOULDBLOCK. */ SOLISTEN_LOCK(xprt->xp_socket); if (TAILQ_EMPTY(&xprt->xp_socket->sol_comp)) xprt_inactive_self(xprt); SOLISTEN_UNLOCK(xprt->xp_socket); sx_xunlock(&xprt->xp_lock); return (FALSE); } if (error) { /* NOTE(review): the upcall is cleared here with soupcall_clear(SO_RCV) while svc_vc_rendezvous_destroy() uses solisten_upcall_set(NULL, NULL) for the same socket - confirm which is right for a listening socket. */ SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(xprt->xp_socket, SO_RCV); } SOLISTEN_UNLOCK(xprt->xp_socket); xprt_inactive_self(xprt); sx_xunlock(&xprt->xp_lock); return (FALSE); } sx_xunlock(&xprt->xp_lock); sa = NULL; error = soaccept(so, &sa); if (error) { /* * XXX not sure if I need to call sofree or soclose here. */ if (sa) free(sa, M_SONAME); return (FALSE); } /* * svc_vc_create_conn will call xprt_register - we don't need * to do anything with the new connection except dereference it. */ new_xprt = svc_vc_create_conn(xprt->xp_pool, so, sa); if (!new_xprt) { soclose(so); } else { SVC_RELEASE(new_xprt); } free(sa, M_SONAME); return (FALSE); /* there is never an rpc msg to be processed */ } /* A rendezvous transport always reports XPRT_IDLE. */ /*ARGSUSED*/ static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *xprt) { return (XPRT_IDLE); } /* * Teardown shared by the rendezvous and connection destroy ops: * release or close the socket (if any), free xp_netid and free the * transport itself. */ static void svc_vc_destroy_common(SVCXPRT *xprt) { enum clnt_stat stat; uint32_t reterr; if (xprt->xp_socket) { if ((xprt->xp_tls & (RPCTLS_FLAGS_HANDSHAKE | RPCTLS_FLAGS_HANDSHFAIL)) != 0) { if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) { /* * If the upcall fails, the socket has * probably been closed via the rpctlssd * daemon having crashed or been * restarted, so just ignore returned stat. */ stat = rpctls_srv_disconnect(xprt->xp_sslsec, xprt->xp_sslusec, xprt->xp_sslrefno, &reterr); } /* Must sorele() to get rid of reference.
*/ CURVNET_SET(xprt->xp_socket->so_vnet); SOCK_LOCK(xprt->xp_socket); sorele(xprt->xp_socket); CURVNET_RESTORE(); } else (void)soclose(xprt->xp_socket); } if (xprt->xp_netid) (void) mem_free(xprt->xp_netid, strlen(xprt->xp_netid) + 1); svc_xprt_free(xprt); } /* Clear the listen upcall, if one is set, then run the common teardown. */ static void svc_vc_rendezvous_destroy(SVCXPRT *xprt) { SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; solisten_upcall_set(xprt->xp_socket, NULL, NULL); } SOLISTEN_UNLOCK(xprt->xp_socket); svc_vc_destroy_common(xprt); } /* * Destroy a connection transport: clear the receive upcall, release * the CLIENT held in xp_p2 (if any), run the common teardown and then * free the buffered mbuf chains and the cf_conn. cd is read before the * transport is freed so it stays usable afterwards. */ static void svc_vc_destroy(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; CLIENT *cl = (CLIENT *)xprt->xp_p2; SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; if (xprt->xp_socket->so_rcv.sb_upcall != NULL) soupcall_clear(xprt->xp_socket, SO_RCV); } SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); if (cl != NULL) CLNT_RELEASE(cl); svc_vc_destroy_common(xprt); if (cd->mreq) m_freem(cd->mreq); if (cd->mpending) m_freem(cd->mpending); mem_free(cd, sizeof(*cd)); } /* * Destroy a backchannel transport: free the transport, every packet * linked through m_nextpkt on cd->mreq, and the cf_conn. */ static void svc_vc_backchannel_destroy(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; struct mbuf *m, *m2; svc_xprt_free(xprt); m = cd->mreq; while (m != NULL) { m2 = m; m = m->m_nextpkt; m_freem(m2); } mem_free(cd, sizeof(*cd)); } /* The three control ops below accept no requests and always return FALSE. */ /*ARGSUSED*/ static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static bool_t svc_vc_rendezvous_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } /* * Report XPRT_DIED for a dead stream, XPRT_MOREREQS when a complete * record is buffered in cd->mreq or the socket is readable, and * XPRT_IDLE otherwise. */ static enum xprt_stat svc_vc_stat(SVCXPRT *xprt) { struct cf_conn *cd; cd = (struct cf_conn *)(xprt->xp_p1); if (cd->strm_stat == XPRT_DIED) return (XPRT_DIED); if (cd->mreq != NULL && cd->resid == 0 && cd->eor) return (XPRT_MOREREQS); if (soreadable(xprt->xp_socket)) return (XPRT_MOREREQS); return (XPRT_IDLE); } /* * Store in the ack argument the value of xp_snt_cnt minus the number * of bytes currently used in the socket send buffer; always returns * TRUE. */ static bool_t svc_vc_ack(SVCXPRT *xprt, uint32_t *ack) { *ack =
atomic_load_acq_32(&xprt->xp_snt_cnt); *ack -= sbused(&xprt->xp_socket->so_snd); return (TRUE); } /* XPRT_MOREREQS while a request is queued on cd->mreq, otherwise XPRT_IDLE. */ static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *xprt) { struct cf_conn *cd; cd = (struct cf_conn *)(xprt->xp_p1); if (cd->mreq != NULL) return (XPRT_MOREREQS); return (XPRT_IDLE); } /* * If we have an mbuf chain in cd->mpending, try to parse a record from it, * leaving the result in cd->mreq. If we don't have a complete record, leave * the partial result in cd->mreq and try to read more from the socket. * * Returns FALSE when a record marker is expected but not yet fully * buffered, TRUE otherwise. The receive low-water mark is adjusted * on every return path. */ static int svc_vc_process_pending(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct socket *so = xprt->xp_socket; struct mbuf *m; /* * If cd->resid is non-zero, we have part of the * record already, otherwise we are expecting a record * marker. */ if (!cd->resid && cd->mpending) { /* * See if there is enough data buffered to * make up a record marker. Make sure we can * handle the case where the record marker is * split across more than one mbuf. */ size_t n = 0; uint32_t header; m = cd->mpending; while (n < sizeof(uint32_t) && m) { n += m->m_len; m = m->m_next; } if (n < sizeof(uint32_t)) { so->so_rcv.sb_lowat = sizeof(uint32_t) - n; return (FALSE); } m_copydata(cd->mpending, 0, sizeof(header), (char *)&header); header = ntohl(header); /* Top bit of the marker is the last-fragment flag, the low 31 bits the fragment length. */ cd->eor = (header & 0x80000000) != 0; cd->resid = header & 0x7fffffff; m_adj(cd->mpending, sizeof(uint32_t)); } /* * Start pulling off mbufs from cd->mpending * until we either have a complete record or * we run out of data. We use m_split to pull * data - it will pull as much as possible and * split the last mbuf if necessary.
*/ while (cd->mpending && cd->resid) { m = cd->mpending; if (cd->mpending->m_next || cd->mpending->m_len > cd->resid) cd->mpending = m_split(cd->mpending, cd->resid, M_WAITOK); else cd->mpending = NULL; if (cd->mreq) m_last(cd->mreq)->m_next = m; else cd->mreq = m; while (m) { cd->resid -= m->m_len; m = m->m_next; } } /* * Block receive upcalls if we have more data pending, * otherwise report our need. */ if (cd->mpending) so->so_rcv.sb_lowat = INT_MAX; else so->so_rcv.sb_lowat = imax(1, imin(cd->resid, so->so_rcv.sb_hiwat / 2)); return (TRUE); } /* * Receive op for a connection transport. Returns TRUE when a * complete request was decoded by xdr_callmsg() into msg, with the * mbufs returned by xdrmbuf_getall() stored through mp and NULL * stored through addrp. Returns FALSE in every other case: no * complete request yet, reception disabled, decode failure, socket * error or EOF (the last two also mark the stream XPRT_DIED). */ static bool_t svc_vc_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct uio uio; struct mbuf *m, *ctrl; struct socket* so = xprt->xp_socket; XDR xdrs; int error, rcvflag; uint32_t reterr, xid_plus_direction[2]; struct cmsghdr *cmsg; struct tls_get_record tgr; enum clnt_stat ret; /* * Serialise access to the socket and our own record parsing * state. */ sx_xlock(&xprt->xp_lock); for (;;) { /* If we have no request ready, check pending queue. */ while (cd->mpending && (cd->mreq == NULL || cd->resid != 0 || !cd->eor)) { if (!svc_vc_process_pending(xprt)) break; } /* Process and return complete request in cd->mreq. */ if (cd->mreq != NULL && cd->resid == 0 && cd->eor) { /* * Now, check for a backchannel reply. * The XID is in the first uint32_t of the reply * and the message direction is the second one. */ if ((cd->mreq->m_len >= sizeof(xid_plus_direction) || m_length(cd->mreq, NULL) >= sizeof(xid_plus_direction)) && xprt->xp_p2 != NULL) { m_copydata(cd->mreq, 0, sizeof(xid_plus_direction), (char *)xid_plus_direction); xid_plus_direction[0] = ntohl(xid_plus_direction[0]); xid_plus_direction[1] = ntohl(xid_plus_direction[1]); /* Check message direction.
*/ if (xid_plus_direction[1] == REPLY) { clnt_bck_svccall(xprt->xp_p2, cd->mreq, xid_plus_direction[0]); cd->mreq = NULL; continue; } } xdrmbuf_create(&xdrs, cd->mreq, XDR_DECODE); cd->mreq = NULL; /* Check for next request in a pending queue. */ svc_vc_process_pending(xprt); if (cd->mreq == NULL || cd->resid != 0) { SOCKBUF_LOCK(&so->so_rcv); if (!soreadable(so)) xprt_inactive_self(xprt); SOCKBUF_UNLOCK(&so->so_rcv); } sx_xunlock(&xprt->xp_lock); if (! xdr_callmsg(&xdrs, msg)) { XDR_DESTROY(&xdrs); return (FALSE); } *addrp = NULL; *mp = xdrmbuf_getall(&xdrs); XDR_DESTROY(&xdrs); return (TRUE); } /* * If receiving is disabled so that a TLS handshake can be * done by the rpctlssd daemon, return FALSE here. */ rcvflag = MSG_DONTWAIT; if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) rcvflag |= MSG_TLSAPPDATA; tryagain: if (xprt->xp_dontrcv) { sx_xunlock(&xprt->xp_lock); return (FALSE); } /* * The socket upcall calls xprt_active() which will eventually * cause the server to call us here. We attempt to * read as much as possible from the socket and put * the result in cd->mpending. If the read fails, * we have drained both cd->mpending and the socket so * we can call xprt_inactive(). */ uio.uio_resid = 1000000000; uio.uio_td = curthread; ctrl = m = NULL; error = soreceive(so, NULL, &uio, &m, &ctrl, &rcvflag); if (error == EWOULDBLOCK) { /* * We must re-test for readability after * taking the lock to protect us in the case * where a new packet arrives on the socket * after our call to soreceive fails with * EWOULDBLOCK. */ SOCKBUF_LOCK(&so->so_rcv); if (!soreadable(so)) xprt_inactive_self(xprt); SOCKBUF_UNLOCK(&so->so_rcv); sx_xunlock(&xprt->xp_lock); return (FALSE); } /* * A return of ENXIO indicates that there is a * non-application data record at the head of the * socket's receive queue, for TLS connections. * This record needs to be handled in userland * via an SSL_read() call, so do an upcall to the daemon.
*/ if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0 && error == ENXIO) { /* Disable reception. */ xprt->xp_dontrcv = TRUE; sx_xunlock(&xprt->xp_lock); ret = rpctls_srv_handlerecord(xprt->xp_sslsec, xprt->xp_sslusec, xprt->xp_sslrefno, &reterr); sx_xlock(&xprt->xp_lock); xprt->xp_dontrcv = FALSE; if (ret != RPC_SUCCESS || reterr != RPCTLSERR_OK) { /* * All we can do is soreceive() it and * then toss it. */ rcvflag = MSG_DONTWAIT; goto tryagain; } sx_xunlock(&xprt->xp_lock); xprt_active(xprt); /* Harmless if already active. */ return (FALSE); } if (error) { SOCKBUF_LOCK(&so->so_rcv); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(so, SO_RCV); } SOCKBUF_UNLOCK(&so->so_rcv); xprt_inactive_self(xprt); cd->strm_stat = XPRT_DIED; sx_xunlock(&xprt->xp_lock); return (FALSE); } if (!m) { /* * EOF - the other end has closed the socket. */ xprt_inactive_self(xprt); cd->strm_stat = XPRT_DIED; sx_xunlock(&xprt->xp_lock); return (FALSE); } /* Process any record header(s). */ if (ctrl != NULL) { cmsg = mtod(ctrl, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); /* * This should have been handled by * the rpctls_srv_handlerecord() * upcall. If not, all we can do is * toss it away.
*/ if (tgr.tls_type != TLS_RLTYPE_APP) { m_freem(m); m_free(ctrl); rcvflag = MSG_DONTWAIT | MSG_TLSAPPDATA; goto tryagain; } } m_free(ctrl); } /* Append the newly received data to the pending chain and loop to parse it. */ if (cd->mpending) m_last(cd->mpending)->m_next = m; else cd->mpending = m; } } /* * Receive op for a backchannel transport: take the next request off * cd->mreq while holding ct_lock and decode its call header. Returns * FALSE if xp_p2 is unset, the queue is empty (the transport is then * marked inactive) or decoding fails; TRUE otherwise. */ static bool_t svc_vc_backchannel_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct ct_data *ct; struct mbuf *m; XDR xdrs; sx_xlock(&xprt->xp_lock); ct = (struct ct_data *)xprt->xp_p2; if (ct == NULL) { sx_xunlock(&xprt->xp_lock); return (FALSE); } mtx_lock(&ct->ct_lock); m = cd->mreq; if (m == NULL) { xprt_inactive_self(xprt); mtx_unlock(&ct->ct_lock); sx_xunlock(&xprt->xp_lock); return (FALSE); } cd->mreq = m->m_nextpkt; mtx_unlock(&ct->ct_lock); sx_xunlock(&xprt->xp_lock); xdrmbuf_create(&xdrs, m, XDR_DECODE); if (! xdr_callmsg(&xdrs, msg)) { XDR_DESTROY(&xdrs); return (FALSE); } *addrp = NULL; *mp = xdrmbuf_getall(&xdrs); XDR_DESTROY(&xdrs); return (TRUE); } /* * Reply op for a connection transport: encode msg (appending m for a * successful accepted reply), prepend the record mark and send the * result on xp_socket. On a successful send the optional seq argument * receives xp_snd_cnt. Returns FALSE only when encoding fails. * NOTE(review): a sosend() error leaves stat TRUE, so send failures * are not reported to the caller - confirm this is intended. */ static bool_t svc_vc_reply(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr *addr, struct mbuf *m, uint32_t *seq) { XDR xdrs; struct mbuf *mrep; bool_t stat = TRUE; int error, len, maxextsiz; #ifdef KERN_TLS u_int maxlen; #endif /* * Leave space for record mark. */ mrep = m_gethdr(M_WAITOK, MT_DATA); mrep->m_data += sizeof(uint32_t); xdrmbuf_create(&xdrs, mrep, XDR_ENCODE); if (msg->rm_reply.rp_stat == MSG_ACCEPTED && msg->rm_reply.rp_acpt.ar_stat == SUCCESS) { if (!xdr_replymsg(&xdrs, msg)) stat = FALSE; else xdrmbuf_append(&xdrs, m); } else { stat = xdr_replymsg(&xdrs, msg); } if (stat) { m_fixhdr(mrep); /* * Prepend a record marker containing the reply length. */ M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK); len = mrep->m_pkthdr.len; *mtod(mrep, uint32_t *) = htonl(0x80000000 | (len - sizeof(uint32_t))); /* For RPC-over-TLS, copy mrep to a chain of ext_pgs. */ if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) { /* * Copy the mbuf chain to a chain of * ext_pgs mbuf(s) as required by KERN_TLS.
*/ maxextsiz = TLS_MAX_MSG_SIZE_V10_2; #ifdef KERN_TLS if (rpctls_getinfo(&maxlen, false, false)) maxextsiz = min(maxextsiz, maxlen); #endif mrep = _rpc_copym_into_ext_pgs(mrep, maxextsiz); } atomic_add_32(&xprt->xp_snd_cnt, len); /* * sosend consumes mrep. */ error = sosend(xprt->xp_socket, NULL, NULL, mrep, NULL, 0, curthread); if (!error) { atomic_add_rel_32(&xprt->xp_snt_cnt, len); if (seq) *seq = xprt->xp_snd_cnt; stat = TRUE; } else atomic_subtract_32(&xprt->xp_snd_cnt, len); } else { m_freem(mrep); } XDR_DESTROY(&xdrs); return (stat); } /* * Reply op for a backchannel transport: encodes the reply the same way * as svc_vc_reply() but sends it on the socket of the ct_data found in * xp_p2, using EPIPE as the error when xp_p2 is unset. seq is not * written. Returns FALSE only when encoding fails. */ static bool_t svc_vc_backchannel_reply(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr *addr, struct mbuf *m, uint32_t *seq) { struct ct_data *ct; XDR xdrs; struct mbuf *mrep; bool_t stat = TRUE; int error, maxextsiz; #ifdef KERN_TLS u_int maxlen; #endif /* * Leave space for record mark. */ mrep = m_gethdr(M_WAITOK, MT_DATA); mrep->m_data += sizeof(uint32_t); xdrmbuf_create(&xdrs, mrep, XDR_ENCODE); if (msg->rm_reply.rp_stat == MSG_ACCEPTED && msg->rm_reply.rp_acpt.ar_stat == SUCCESS) { if (!xdr_replymsg(&xdrs, msg)) stat = FALSE; else xdrmbuf_append(&xdrs, m); } else { stat = xdr_replymsg(&xdrs, msg); } if (stat) { m_fixhdr(mrep); /* * Prepend a record marker containing the reply length. */ M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK); *mtod(mrep, uint32_t *) = htonl(0x80000000 | (mrep->m_pkthdr.len - sizeof(uint32_t))); /* For RPC-over-TLS, copy mrep to a chain of ext_pgs. */ if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) { /* * Copy the mbuf chain to a chain of * ext_pgs mbuf(s) as required by KERN_TLS.
*/ maxextsiz = TLS_MAX_MSG_SIZE_V10_2; #ifdef KERN_TLS if (rpctls_getinfo(&maxlen, false, false)) maxextsiz = min(maxextsiz, maxlen); #endif mrep = _rpc_copym_into_ext_pgs(mrep, maxextsiz); } sx_xlock(&xprt->xp_lock); ct = (struct ct_data *)xprt->xp_p2; if (ct != NULL) error = sosend(ct->ct_socket, NULL, NULL, mrep, NULL, 0, curthread); else error = EPIPE; sx_xunlock(&xprt->xp_lock); /* NOTE(review): stat is already TRUE here, so a send error (including the EPIPE case, where mrep is not passed to sosend) does not change the return value - confirm this is intended. */ if (!error) { stat = TRUE; } } else { m_freem(mrep); } XDR_DESTROY(&xdrs); return (stat); } /* Placeholder op that always returns FALSE. */ static bool_t svc_vc_null() { return (FALSE); } /* Receive upcall: mark the transport active when its socket is readable. Always returns SU_OK. */ static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag) { SVCXPRT *xprt = (SVCXPRT *) arg; if (soreadable(xprt->xp_socket)) xprt_active(xprt); return (SU_OK); } /* Listen upcall: mark the transport active when head->sol_comp is not empty. Always returns SU_OK. */ static int svc_vc_rendezvous_soupcall(struct socket *head, void *arg, int waitflag) { SVCXPRT *xprt = (SVCXPRT *) arg; if (!TAILQ_EMPTY(&head->sol_comp)) xprt_active(xprt); return (SU_OK); } #if 0 /* * Get the effective UID of the sending process. Used by rpcbind, keyserv * and rpc.yppasswdd on AF_LOCAL. */ int __rpc_get_local_uid(SVCXPRT *transp, uid_t *uid) { int sock, ret; gid_t egid; uid_t euid; struct sockaddr *sa; sock = transp->xp_fd; sa = (struct sockaddr *)transp->xp_rtaddr; if (sa->sa_family == AF_LOCAL) { ret = getpeereid(sock, &euid, &egid); if (ret == 0) *uid = euid; return (ret); } else return (-1); } #endif